A growing post which gathers short pieces of code
In every snippet below, assume `spark` is an already-open Spark session.
# Open a spark session
spark = SparkSession \
.builder.config(conf=conf) \
.appName("code-test").getOrCreate()How to merge 2 spark DataFrame
- Python:
# Id is the column on which we want to merge 2 DataFrame
dataset1 = spark.read.csv("../input/data1.csv", header="true",inferSchema="true")
dataset2 = spark.read.csv("../input/data2.csv", header="true",inferSchema="true")
dataset= dataset1.join(dataset2, dataset1.Id ==dataset2.Id).drop(dataset1.Id)- Scala:
// Id is the column on which we want to merge the 2 DataFrames
val dataset1 = spark.read.option("header", "true").option("inferSchema", true).csv(inputTestPath + "/input/dataset1.csv")
val dataset2 = spark.read.option("header", "true").option("inferSchema", true).csv(inputTestPath + "/input/dataset2.csv")
// Join dataset1 with dataset2 (not with itself) on equal Id values; this overload is an inner join.
// Dropping dataset1's Id column afterwards leaves a single Id column in the result.
val dataset = dataset1.join(dataset2, dataset1("Id") === dataset2("Id")).drop(dataset1.col("Id"))
How to perform Cross Validation using spark ML in python
# Gradient-boosted tree classifier whose maxDepth and stepSize are tuned below.
gbt = GBTClassifier(
    labelCol="label",
    featuresCol="features",
    maxIter=100,
    seed=77,
)
# Hyper-parameter grid: 3 depths x 3 step sizes = 9 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [9, 10, 11])
    .addGrid(gbt.stepSize, [0.1, 0.01, 0.3])
    .build()
)
# Single-stage Pipeline wrapping the classifier.
pipeline = Pipeline(stages=[gbt])
# Candidates are scored by accuracy of "prediction" against "label".
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
# 3-fold cross validation over every setting in the grid.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)
model = crossval.fit(trainingData)
# Keep the model fitted with the best-scoring parameter setting.
best_model = model.bestModel


