// 简单版本(计算词频 并取前三) — Simple version: count word frequencies and take the top three
/**
 * Word-frequency example: counts how often each word occurs across a list of
 * lines and reports the three most frequent words.
 *
 * Two equivalent implementations are provided: a step-by-step one
 * ([[taskThirdWords1]]) and a single-chain one ([[taskThirdWords2]]).
 * Both split lines on single spaces, ignore empty tokens, and order the
 * result by count (descending) and then by word (ascending).
 */
object WordCountSimple {

  /** Number of most-frequent words reported by both implementations. */
  private val TopN = 3

  def main(args: Array[String]): Unit = {
    val list: List[String] = List(
      "hello",
      "hello world",
      "hello world by spark",
      "hello world by flink",
      "hello scala",
      "scala"
    )
    // Run both implementations on the same input and print each result.
    val words1 = taskThirdWords1(list)
    val words2 = taskThirdWords2(list)
    println(words1)
    println("=======================================")
    println(words2)
  }

  /**
   * Ranking predicate for `sortWith`: higher counts first; equal counts are
   * ordered by word (`String.compareTo`). The tie-break matters because the
   * pairs come out of a `Map`, whose iteration order is unspecified.
   */
  private def rankedBefore(a: (String, Int), b: (String, Int)): Boolean =
    a._2 > b._2 || (a._2 == b._2 && a._1.compareTo(b._1) < 0)

  /**
   * Step-by-step version: every intermediate result is named and typed.
   *
   * @param words lines of text; words within a line are separated by spaces
   * @return at most three `(word, count)` pairs, most frequent first;
   *         ties are ordered alphabetically by word
   */
  def taskThirdWords1(words: List[String]): List[(String, Int)] = {
    // 1. Split every line on single spaces.
    val tokensPerLine: List[Array[String]] = words.map(_.split(" "))
    // 2. Flatten into one list of words. Empty tokens (produced by "" or by
    //    leading/repeated spaces) are not words and are dropped.
    val tokens: List[String] = tokensPerLine.flatten.filter(_.nonEmpty)
    // 3. Group identical words and count the occurrences of each.
    val grouped: Map[String, List[String]] = tokens.groupBy(identity)
    val counts: Map[String, Int] =
      grouped.map { case (word, occurrences) => (word, occurrences.length) }
    // 4. Convert to a list of tuples so it can be sorted.
    val countList: List[(String, Int)] = counts.toList
    // 5. Sort by count descending, then by word.
    val ranked: List[(String, Int)] = countList.sortWith(rankedBefore)
    // 6. Keep the top three.
    ranked.take(TopN)
  }

  /**
   * Condensed version of [[taskThirdWords1]] written as a single chain.
   *
   * @param words lines of text; words within a line are separated by spaces
   * @return at most three `(word, count)` pairs, most frequent first;
   *         ties are ordered alphabetically by word
   */
  def taskThirdWords2(words: List[String]): List[(String, Int)] =
    words
      .flatMap(_.split(" "))
      .filter(_.nonEmpty) // drop empty tokens from "" or repeated spaces
      .groupBy(identity)
      .map { case (word, occurrences) => (word, occurrences.length) }
      .toList
      .sortWith(rankedBefore)
      .take(TopN)
}
// 复杂版本(计算词频 并取前三) — Complex version: count word frequencies and take the top three
/**
 * Word-frequency example over pre-counted input: each element is a line of
 * text together with the number of times that line occurred. The three words
 * with the highest total frequency are reported.
 *
 * Two strategies are provided; they return the same result:
 *  - [[taskThirdWords1]] expands every word `count` times and then counts.
 *  - [[taskThirdWords2]] attaches the line's count to each word and sums.
 *
 * Both split lines on single spaces, ignore empty tokens, ignore entries whose
 * count is not positive, and order the result by count (descending) and then
 * by word (ascending).
 */
object WordCountComplex {

  /** Number of most-frequent words reported by both implementations. */
  private val TopN = 3

  def main(args: Array[String]): Unit = {
    val list: List[(String, Int)] = List(
      ("hello", 2),
      ("hello world", 1),
      ("hello world by spark", 4),
      ("hello world by flink", 3),
      ("hello scala", 8),
      ("scala", 1)
    )
    // Run both strategies on the same input and print each result.
    val words1 = taskThirdWords1(list)
    val words2 = taskThirdWords2(list)
    println(words1)
    println("=======================================")
    println(words2)
  }

  /** Splits a line on single spaces and drops the empty tokens that "" or repeated spaces produce. */
  private def tokenize(line: String): List[String] =
    line.split(" ").filter(_.nonEmpty).toList

  /**
   * Ranking predicate for `sortWith`: higher counts first; equal counts are
   * ordered by word (`String.compareTo`). The tie-break matters because the
   * pairs come out of a `Map`, whose iteration order is unspecified.
   */
  private def rankedBefore(a: (String, Int), b: (String, Int)): Boolean =
    a._2 > b._2 || (a._2 == b._2 && a._1.compareTo(b._1) < 0)

  /**
   * Strategy 1: discard the pre-aggregated form by expanding every word to its
   * real number of occurrences, then count as in a plain word count.
   *
   * @param words `(line, count)` pairs; `count` is how many times `line` occurred.
   *              Entries with `count <= 0` contribute nothing.
   * @return at most three `(word, totalCount)` pairs, most frequent first;
   *         ties are ordered alphabetically by word
   */
  def taskThirdWords1(words: List[(String, Int)]): List[(String, Int)] =
    words
      // Emit each word of the line `count` times. List.fill yields an empty
      // list for count <= 0, so such entries add no occurrences.
      .flatMap { case (line, count) =>
        tokenize(line).flatMap(word => List.fill(count)(word))
      }
      // Group identical words.
      .groupBy(identity)
      // The size of each group is the word's total frequency.
      .map { case (word, occurrences) => (word, occurrences.length) }
      .toList
      // Sort by count descending, then by word.
      .sortWith(rankedBefore)
      // Keep the top three.
      .take(TopN)

  /**
   * Strategy 2: keep the pre-aggregated counts. Every word of a line is paired
   * with that line's count, and the counts are summed per word.
   *
   * @param words `(line, count)` pairs; `count` is how many times `line` occurred.
   *              Entries with `count <= 0` are ignored, matching [[taskThirdWords1]].
   * @return at most three `(word, totalCount)` pairs, most frequent first;
   *         ties are ordered alphabetically by word
   */
  def taskThirdWords2(words: List[(String, Int)]): List[(String, Int)] =
    words
      .filter { case (_, count) => count > 0 }
      // ("hello world", 3) becomes ("hello", 3), ("world", 3).
      .flatMap { case (line, count) => tokenize(line).map(word => (word, count)) }
      .groupBy(_._1)
      // Sum the per-line counts of each word.
      .map { case (word, weighted) => (word, weighted.map(_._2).sum) }
      .toList
      .sortWith(rankedBefore)
      .take(TopN)
}