Spark Streaming: Word Count Example
Spark Streaming makes it easy to build scalable fault-tolerant streaming applications.
The Spark Streaming API provides near-real-time stream processing, and it supports Java, Scala, Python and R.
Spark Scala code
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

/**
 * Streaming word count over text files that appear in a directory.
 *
 * Every 10 seconds the job reads the files newly added to the input directory,
 * splits each line on single spaces, counts the occurrences of each word within
 * that batch, writes the counts as text files and prints them on the driver.
 *
 * Usage: StreamingWordCount <input-directory> <output-directory>
 */
object StreamingWordCount {

  /**
   * Program entry point.
   *
   * @param args `args(0)` is the directory monitored for new text files;
   *             `args(1)` is the output path prefix. Each non-empty batch is
   *             written to `<args(1)>-<batch time in ms>`.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage StreamingWordCount <input-directory> <output-directory>")
      // Non-zero status so shells and schedulers see the invocation as failed.
      System.exit(1)
    }
    val inputDir = args(0)
    val output = args(1)

    val conf = new SparkConf().setAppName("Spark Streaming Example")
    // One micro-batch every 10 seconds.
    val streamingContext = new StreamingContext(conf, Seconds(10))

    val lines = streamingContext.textFileStream(inputDir)
    val words = lines.flatMap(_.split(" "))
    val wc = words.map(word => (word, 1))

    // Explicit parameter types select the (RDD, Time) overload of foreachRDD.
    wc.foreachRDD { (rdd: RDD[(String, Int)], time: Time) =>
      // Skip intervals with no new data so they do not create empty output directories.
      if (!rdd.isEmpty()) {
        // Cached because two actions (save + collect) run on the same result.
        val counts = rdd.reduceByKey(_ + _).cache()
        try {
          // saveAsTextFile fails if the target directory already exists, so each
          // batch gets its own directory, suffixed with the batch time.
          counts.saveAsTextFile(s"$output-${time.milliseconds}")
          counts.collect().foreach(c => println(c))
        } finally {
          counts.unpersist()
        }
      }
    }

    println("StreamingWordCount: streamingContext start")
    streamingContext.start()
    println("StreamingWordCount: await termination")
    streamingContext.awaitTermination()
    println("StreamingWordCount: done!")
  }
}