This is how you run K-Means clustering using DataFrames in Spark.
These are the libraries that you need to import:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SaveMode

import scala.sys.process._
// Pull data from Hive or any other DB (for this example we assume that the
// data is stored in Hive).
// NOTE(review): the original query selected "impression" while the code read
// "page_impression"; the column name below must match the Hive table schema
// — TODO confirm against the actual table.
val query = "select query, page_impression from table_search"
// Now execute the query.
val q1 = hqlContext.sql(query).select("query", "page_impression")
// Map each row to (query text, impression count, single-feature vector).
// The vector wraps the impression count because KMeans.train expects an
// RDD[Vector] as its input.
val rowRDD = q1.rdd.map(r =>
  (
    r.getString(0),
    r.getLong(1),
    Vectors.dense(r.getLong(1))
  )
)
// Cache: rowRDD is reused below for both training and prediction.
rowRDD.cache()
val upper_cnt = 8000
val lower_cnt = 100
// Select the training set: keep rows with lower_cnt <= impressions < upper_cnt.
val trainSet = rowRDD.filter(r => r._2 < upper_cnt && r._2 >= lower_cnt)
val numClusters = 3
val numIterations = 20
// Extract just the feature vectors for training.
val feature = trainSet.map(r => r._3)
val kMeansModel = KMeans.train(feature, numClusters, numIterations)
// Here we are segregating all the queries into 3 separate clusters.
// Cluster ids are remapped (1 -> 0, 2 -> 1, 0 -> 2) to produce the desired
// tier ordering; any unexpected id falls through to -1.
val predDF = trainSet.map(r => kMeansModel.predict(r._3) match {
  case 1 => (r._1, r._2, 0)
  case 2 => (r._1, r._2, 1)
  case 0 => (r._1, r._2, 2)
  case _ => (r._1, r._2, -1)
}).toDF("queries", "page_impression", "tier")
// Now saving the data in HDFS as parquet. final_part_dir is assumed to be
// defined elsewhere in the surrounding context — TODO confirm.
predDF.coalesce(10).write.mode(SaveMode.Overwrite).parquet(final_part_dir)
K.2: Logistic regression example:
// Toy training data: (label, feature1, feature2).
val data = Seq((1.0, 0.52, 0.34), (0.0, 0.6, 0.43), (0.0, 1.9, 0.54), (1.0, 0.11, 0.11), (1.0, 0.222, 0.33), (1.0, 0.333, 0.99), (0.0, 0.314, 0.86), (0.0, 0.9888, 0.34), (1.0, 0.264, 0.55))
val df = sc.parallelize(data).toDF("label", "x1", "x2").cache()
// Build a (label, features) DataFrame in the shape the estimator expects.
val train_set = df.rdd.map(row => LabeledPoint(row.getAs[Double](0), Vectors.dense(row.getAs[Double](1), row.getAs[Double](2)))).toDF
train_set.show()
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.3)
val lrModel = lr.fit(train_set)
// Prepare the test data. Feature is a simple wrapper case class so that toDF
// yields a single "features" column; it was referenced but never defined in
// the original example.
case class Feature(features: org.apache.spark.mllib.linalg.Vector)
val test = df.select("x1", "x2").rdd.map(row => Feature(Vectors.dense(row.getAs[Double](0), row.getAs[Double](1)))).toDF("features")
// transform appends prediction/probability columns to the test DataFrame.
val predictionsDF = lrModel.transform(test)
predictionsDF.show()
0 comments:
Post a Comment