Saturday, August 5, 2017

Apache Spark K-Means clustering

K.1: K-Means clustering:

This is how you run K-Means clustering using DataFrames in Spark.

These are the libs that you need to import:
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SaveMode
import scala.sys.process._

// Pull data from Hive (or any other DB; for this example we assume the data is in Hive).
// NOTE(review): the original query selected "impression" but the code reads the
// column "page_impression" — the query now returns the column actually used.
val query = "select query, page_impression from table_search"

// Execute the query and keep only the two columns we need.
val q1 = hqlContext.sql(query).select("query", "page_impression")

// Build (query, impression count, 1-D feature vector) triples.
// KMeans.train takes an RDD[Vector], so the impression count is wrapped in a
// dense vector; the raw fields ride along for the final report.
val rowRDD = q1.rdd.map(r =>
  (
    r.getString(0),
    r.getLong(1),
    Vectors.dense(r.getLong(1))
  )
)

// Cached because this RDD is traversed more than once (training + prediction).
rowRDD.cache()

// Impression-count band for the training set; rows outside it are treated as
// outliers and excluded.
val upper_cnt = 8000
val lower_cnt = 100

// Select the training set.
val trainSet = rowRDD.filter(r => r._2 < upper_cnt && r._2 >= lower_cnt)
val numClusters = 3
val numIterations = 20
val feature = trainSet.map(r => r._3)

// Train the K-Means model on the 1-D impression-count features.
val kMeansModel = KMeans.train(feature, numClusters, numIterations)

// Here we are segregating all the queries into 3 separate clusters (tiers).
// NOTE(review): KMeans assigns cluster ids arbitrarily per run, so the
// id -> tier remapping below (1->0, 2->1, 0->2) reflects one particular run;
// verify the ordering (e.g. by inspecting kMeansModel.clusterCenters) before
// relying on it.
val predDF = trainSet.map(r => kMeansModel.predict(r._3) match {
  case 1 => (r._1, r._2, 0)
  case 2 => (r._1, r._2, 1)
  case 0 => (r._1, r._2, 2)
  case _ => (r._1, r._2, -1)
}).toDF("queries", "page_impression", "tier")

// Now save the data to HDFS as parquet. Note: final_part_dir must be defined
// beforehand, e.g. val final_part_dir = "/path/to/output".
predDF.coalesce(10).write.mode(SaveMode.Overwrite).parquet(final_part_dir)




K.2: Logistic regression example:

// K.2: Logistic regression with the spark.ml DataFrame API.
// NOTE(review): spark.ml's LogisticRegression requires ml.linalg vectors (not
// mllib.linalg), so the ml types are fully qualified here to avoid clashing
// with the mllib imports used by the K-Means example above.

// Row wrapper for prediction-time input: a bare feature vector. (The original
// snippet used `Feature` without ever defining it.)
case class Feature(features: org.apache.spark.ml.linalg.Vector)

// Toy training data: (label, x1, x2).
val data = Seq(
  (1.0, 0.52, 0.34),
  (0.0, 0.6, 0.43),
  (0.0, 1.9, 0.54),
  (1.0, 0.11, 0.11),
  (1.0, 0.222, 0.33),
  (1.0, 0.333, 0.99),
  (0.0, 0.314, 0.86),
  (0.0, 0.9888, 0.34),
  (1.0, 0.264, 0.55)
)

val df = sc.parallelize(data).toDF("label", "x1", "x2").cache()

// Build (label, features) rows; spark.ml expects the features column to hold
// an ml.linalg.Vector.
val train_set = df.rdd.map { row =>
  org.apache.spark.ml.feature.LabeledPoint(
    row.getAs[Double](0),
    org.apache.spark.ml.linalg.Vectors.dense(row.getAs[Double](1), row.getAs[Double](2))
  )
}.toDF

train_set.show()

// Fit with at most 10 iterations and regularization parameter 0.3.
val lr = new org.apache.spark.ml.classification.LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.3)

val lrModel = lr.fit(train_set)

// Prepare the test data: reuse the two feature columns only.
val test = df.select("x1", "x2").rdd.map { row =>
  Feature(org.apache.spark.ml.linalg.Vectors.dense(row.getAs[Double](0), row.getAs[Double](1)))
}.toDF("features")

val predictionsDF = lrModel.transform(test)
predictionsDF.show()


0 comments:

Post a Comment