Scala 实现 WordCount 取词频(学习)

Scala 实现 WordCount 取词频(学习)

简单版本(计算词频 并取前三)

/**
 * Word-count example: counts how often each word occurs in a list of
 * sentences and reports the three most frequent words.
 *
 * Words are separated by runs of whitespace; empty tokens (from empty lines,
 * leading blanks or repeated blanks) are not counted. Words with the same
 * count are ordered alphabetically so the result is deterministic.
 */
object WordCountSimple {

  def main(args: Array[String]): Unit = {

    val list: List[String] = List(
      "hello",
      "hello world",
      "hello world by spark",
      "hello world by flink",
      "hello scala",
      "scala"
    )

    // Compute the word frequencies with both implementations; they are
    // expected to print the same result.
    val words1 = taskThirdWords1(list)
    val words2 = taskThirdWords2(list)

    println(words1)
    println("=======================================")
    println(words2)
  }

  /**
   * Step-by-step version: every intermediate result is named so the
   * pipeline can be followed one stage at a time.
   *
   * @param words sentences whose words are separated by whitespace
   * @return at most three `(word, count)` pairs, highest count first;
   *         equal counts are ordered by word, ascending
   */
  def taskThirdWords1(words: List[String]): List[(String, Int)] = {
    // 1. Split every sentence on runs of whitespace.
    val listT: List[Array[String]] = words.map(_.split("\\s+"))

    // 2. Flatten into one list of words. `split` yields an empty token for an
    //    empty sentence or a leading blank, so those are dropped here.
    val flatten: List[String] = listT.flatten.filter(_.nonEmpty)

    // 3. Group identical words and count the size of each group.
    val mapGroup: Map[String, List[String]] = flatten.groupBy(word => word)
    val wordCountMap: Map[String, Int] =
      mapGroup.map { case (word, occurrences) => (word, occurrences.length) }

    // 4. Convert to a list of tuples so it can be sorted.
    val wordCountList: List[(String, Int)] = wordCountMap.toList

    // 5. Sort by count, descending. The word is the secondary key because the
    //    map's iteration order is unspecified and would otherwise decide ties.
    val wordCountListBySort: List[(String, Int)] =
      wordCountList.sortBy { case (word, count) => (-count, word) }

    // 6. Keep the top three.
    wordCountListBySort.take(3)
  }

  /**
   * Condensed version of [[taskThirdWords1]]: the same pipeline written as a
   * single method chain.
   *
   * @param words sentences whose words are separated by whitespace
   * @return at most three `(word, count)` pairs, highest count first;
   *         equal counts are ordered by word, ascending
   */
  def taskThirdWords2(words: List[String]): List[(String, Int)] = {
    words.flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .groupBy(word => word)
      .map { case (word, occurrences) => (word, occurrences.length) }
      .toList
      .sortBy { case (word, count) => (-count, word) }
      .take(3)
  }

}

复杂版本(计算词频 并取前三)

/**
 * Word-count example over pre-aggregated input: every row is a sentence
 * together with the number of times that sentence occurred. The three words
 * with the highest total frequency are reported.
 *
 * Both strategies below skip rows whose count is not positive, ignore empty
 * tokens, and order words with equal totals alphabetically, so they always
 * return the same result.
 */
object WordCountComplex {

  def main(args: Array[String]): Unit = {
    val list: List[(String, Int)] = List(
      ("hello", 2),
      ("hello world", 1),
      ("hello world by spark", 4),
      ("hello world by flink", 3),
      ("hello scala", 8),
      ("scala", 1)
    )

    // Compute the word frequencies with both strategies; they are expected
    // to print the same result.
    val words1 = taskThirdWords1(list)
    val words2 = taskThirdWords2(list)

    println(words1)
    println("=======================================")
    println(words2)
  }

  /** Splits a sentence on runs of whitespace and drops empty tokens. */
  private def tokenize(sentence: String): List[String] =
    sentence.split("\\s+").toList.filter(_.nonEmpty)

  /**
   * Returns the three entries with the highest count, highest first. The word
   * is the secondary sort key because a map's iteration order is unspecified
   * and would otherwise decide ties.
   */
  private def topThree(counts: Map[String, Int]): List[(String, Int)] =
    counts.toList
      .sortBy { case (word, count) => (-count, word) }
      .take(3)

  /**
   * Strategy 1: undo the pre-aggregation. Each sentence is repeated `count`
   * times, after which the problem is a plain word count.
   *
   * @param words `(sentence, count)` rows; rows with `count <= 0` are skipped
   * @return at most three `(word, total)` pairs, highest total first;
   *         equal totals are ordered by word, ascending
   */
  def taskThirdWords1(words: List[(String, Int)]): List[(String, Int)] = {
    val occurrences: Map[String, Int] = words
      // A non-positive count contributes no occurrences.
      .filter { case (_, count) => count > 0 }
      // Expand every row back to `count` copies of its sentence.
      .flatMap { case (sentence, count) => List.fill(count)(sentence) }
      // Split into words and flatten.
      .flatMap(tokenize)
      // Group identical words and count the size of each group.
      .groupBy(word => word)
      .map { case (word, group) => (word, group.length) }

    topThree(occurrences)
  }

  /**
   * Strategy 2: keep the pre-aggregated counts. Every word of a sentence is
   * paired with that sentence's count, and the counts are summed per word, so
   * no row has to be repeated.
   *
   * @param words `(sentence, count)` rows; rows with `count <= 0` are skipped
   * @return at most three `(word, total)` pairs, highest total first;
   *         equal totals are ordered by word, ascending
   */
  def taskThirdWords2(words: List[(String, Int)]): List[(String, Int)] = {
    val totals: Map[String, Int] = words
      // Skipped so that both strategies treat non-positive rows the same way.
      .filter { case (_, count) => count > 0 }
      // ("hello world", 3) becomes ("hello", 3), ("world", 3).
      .flatMap { case (sentence, count) => tokenize(sentence).map(word => (word, count)) }
      .groupBy(_._1)
      // Plain `map` rather than `mapValues`, which is deprecated since 2.13.
      .map { case (word, pairs) => (word, pairs.map(_._2).sum) }

    topThree(totals)
  }

}

本文由 在码圈 创作,如果您觉得本文不错,请随意赞赏
采用 知识共享署名4.0 国际许可协议进行许可
您可以自由地转载和修改,但请务必注明文章来源并且不可用于商业目的。
本站部分内容收集于互联网,如果有侵权内容、不妥之处,请联系我们删除。敬请谅解!
原文链接:https://www.bedebug.com/archives/wordcount
最后更新于:2023-11-25 15:59:50

请博主喝咖啡 ☕.