// Sampling example: draw a with-replacement sample from the integers 1..10 and print it.
val conf = new SparkConf()
  .setMaster("local")
  .setAppName("Sample")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(1 to 10)
// An explicit seed (0) is passed so the sampler is seeded the same way on every run.
val sampled = rdd.sample(withReplacement = true, fraction = 0.8, seed = 0)
sampled.foreach(value => print(s"$value "))
sc.stop()
// Union example: concatenate two integer RDDs (1..4 and 4..8) and print every element.
val conf = new SparkConf()
  .setMaster("local")
  .setAppName("union")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(1 to 4)
val rdd2 = sc.parallelize(4 to 8)
// `++` is the operator alias of `union`; it does not de-duplicate, so the shared 4 appears twice.
val unionRDD = rdd1 ++ rdd2
unionRDD.foreach(elem => print(s"$elem "))
sc.stop()
// coalesce example (no shuffle): shrink a 4-partition RDD to 3 partitions and print both counts.
val rdd = sc.parallelize(1 to 16, numSlices = 4)
println(s"重新分区前的分区个数${rdd.partitions.length}")
val coalesceRDD = rdd.coalesce(numPartitions = 3, shuffle = false)
println(s"重新分区后的分区个数:${coalesceRDD.partitions.length}")
重新分区前的分区个数4 重新分区后的分区个数:3
// coalesce example (with shuffle): request 7 partitions from a 4-partition RDD and print both counts.
val rdd = sc.parallelize(1 to 16, numSlices = 4)
println(s"重新分区前的分区个数${rdd.partitions.length}")
// shuffle = true is what allows the partition count to grow beyond the original 4.
val coalesceRDD = rdd.coalesce(numPartitions = 7, shuffle = true)
println(s"重新分区后的分区个数:${coalesceRDD.partitions.length}")
重新分区前的分区个数4 重新分区后的分区个数:7
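// With shuffle = false — i.e. rdd.coalesce(7, false) — the partition count cannot grow, so the output is:
重新分区前的分区个数4 重新分区后的分区个数:4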
// glom example: turn each of the 2 partitions into an array and print that array's class name.
val rdd = sc.parallelize(1 to 6, numSlices = 2)
val glomRDD = rdd.glom()
glomRDD.foreach { partitionArray =>
  println(partitionArray.getClass.getSimpleName)
}
int[]
int[]
// randomSplit example: split 1..10 into three RDDs using weights 1.0 : 3.0 : 6.0 and print each part.
val rdd = sc.parallelize(1 to 10)
val weights = Array(1.0, 3.0, 6.0)
val randomSplitRDD = rdd.randomSplit(weights)
// One split is produced per weight, so iterating visits the same three RDDs as indexing 0, 1 and 2.
for (part <- randomSplitRDD) {
  part.foreach(value => print(s"$value "))
}
8 10
2 6 9
1 3 4 5 7
// mapValues example: add 2 to every age while leaving the name keys untouched, then print the pairs.
val list = List(("zhangsan", 22), ("lisi", 20), ("wangwu", 23))
val rdd = sc.parallelize(list)
val mapValuesRDD = rdd.mapValues(age => age + 2)
mapValuesRDD.foreach(pair => println(pair))
sc.stop()
(zhangsan,24)
(lisi,22)
(wangwu,25)
// flatMapValues example: expand each (name, age) pair into two pairs, (name, age) and (name, "male").
val rdd = sc.parallelize(List(("zhangsan", 22), ("lisi", 20), ("wangwu", 23)))
// Convert the age to String so the values are Seq[String] instead of an accidentally widened Seq[Any];
// the printed output is unchanged.
val flatMapValues = rdd.flatMapValues(age => Seq(age.toString, "male"))
flatMapValues.foreach(println)
// reduceByKey example: multiply together the values that share a key, using 2 result partitions,
// then print each (key, product) pair and the resulting partition count.
val data = List(("A", 3), ("A", 2), ("B", 1), ("B", 3))
// Use the `sc` SparkContext like the other examples (the previous `src` is not defined in the visible code).
val rdd1 = sc.parallelize(data, numSlices = 2)
val reduceByKeyRDD = rdd1.reduceByKey(_ * _, 2)
reduceByKeyRDD.foreach(println)
println(reduceByKeyRDD.partitions.length)
(B,3)
(A,6)
2
// groupByKey example: gather all values for each key into one collection, using 2 result partitions.
val data = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
// Use the `sc` SparkContext like the other examples (the previous `src` is not defined in the visible code).
val rdd1 = sc.parallelize(data, numSlices = 2)
val groupByKeyRDD = rdd1.groupByKey(2)
groupByKeyRDD.foreach(println)
(B,CompactBuffer(2, 3))
(A,CompactBuffer(1, 2))
// sortByKey example: sort the pairs by key in descending order into 2 partitions and print them.
val data = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
// Use the `sc` SparkContext like the other examples (the previous `src` is not defined in the visible code).
val rdd1 = sc.parallelize(data, numSlices = 2)
val sortByKeyRDD = rdd1.sortByKey(ascending = false, numPartitions = 2)
sortByKeyRDD.foreach(println)