Introduction

TF-IDF is a text-mining tool: it multiplies TF, the term frequency, by IDF, the inverse document frequency, to produce a weight. This lets you find words across a collection of documents that appear frequently in a given document yet are "relatively distinctive" to it.
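Written out, for a word $w$ in document $d$ within a collection of $N$ documents (this is the same computation carried out in step 5 below, where $n_{w,d}$ is the number of times $w$ appears in $d$):

$$\mathrm{tf}(w,d)=\frac{n_{w,d}}{\sum_{w'} n_{w',d}},\qquad \mathrm{idf}(w)=\log_2\frac{N}{\lvert\{d : n_{w,d}>0\}\rvert},\qquad \text{tf-idf}(w,d)=\mathrm{tf}(w,d)\cdot\mathrm{idf}(w)$$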

For this exercise, I borrowed the idea from the site our TA shared and looked at how PTT users discuss the three players with the loudest MVP buzz this year: Curry, Lebron, and Harden.

I wanted to see how much attention each of them draws and when, and what kinds of reactions and comments they get. Let's dive in!

1. Load the packages

library(bitops)   # bitwise ops (RCurl dependency)
library(httr)     # GET() for fetching pages
library(RCurl)
library(XML)      # htmlParse(), xpathSApply()
library(NLP)
library(tm)       # corpus handling and text cleaning
library(tmcn)     # Chinese text-mining utilities
library(jiebaRD)  # dictionaries for jiebaR
library(jiebaR)   # Chinese word segmentation
library(dplyr)

2. Crawl the required PTT post URLs and titles

data <- list()
title <- list()
date <- list()

# Crawl the March posts on the PTT NBA board (index pages 5702 to 5794)
for( i in c(5702:5794)){
  tmp <- paste(i, '.html', sep='')
  url <- paste('https://www.ptt.cc/bbs/NBA/index', tmp, sep='')
  html <- htmlParse(GET(url),encoding = "UTF-8")
  title.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlValue)
  url.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlAttrs)
  date.list <- xpathSApply(html, "//div[@class='meta']/div[@class='date']", xmlValue)
  data <- rbind(data, as.matrix(paste('https://www.ptt.cc', url.list, sep='')))
  title <- rbind(title, as.matrix(title.list))
  date <- rbind(date, as.matrix(date.list))
}

# data holds the URLs, title holds the titles, date holds the dates
data <- unlist(data)
title <- unlist(title)
date <- unlist(date)

A small problem I ran into (now solved)

I originally used getURL, but url.list came back NULL; after switching to GET it worked.
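For comparison, here is the failing call next to the working one. A likely cause is that RCurl needs https handled explicitly, while httr's GET does it for you; the ssl.verifypeer workaround below is an untested assumption on my part (url is the variable from the loop above):

# What failed: the content from getURL() yielded a NULL url.list
# html <- htmlParse(getURL(url), encoding = "UTF-8")
# A possible RCurl workaround (untested assumption):
# html <- htmlParse(getURL(url, ssl.verifypeer = FALSE), encoding = "UTF-8")
# What this script uses instead:
html <- htmlParse(GET(url), encoding = "UTF-8")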

3. Find the titles that contain each player's keywords

Lebron <- c()
Lebron.url <- c()
Lebron.date <- c()

# Find the titles containing the keywords and group them
lebron1 <- grep("James", title)
lebron2 <- grep("LBJ", title)
lebron3 <- grep("姆斯", title)

Lebron <- c(Lebron,title[lebron1])
Lebron <- c(Lebron,title[lebron2])
Lebron <- c(Lebron,title[lebron3])

Lebron.url <- c(Lebron.url, data[lebron1])
Lebron.url <- c(Lebron.url, data[lebron2])
Lebron.url <- c(Lebron.url, data[lebron3])

Lebron.date <- c(Lebron.date, date[lebron1])
Lebron.date <- c(Lebron.date, date[lebron2])
Lebron.date <- c(Lebron.date, date[lebron3])

Curry <- c()
Curry.url <- c()
Curry.date <- c()

# Find the titles containing the keywords and group them
curry1 <- grep("Curry", title)
curry2 <- grep("柯瑞", title)
curry3 <- grep("咖哩", title)

Curry <- c(Curry,title[curry1])
Curry <- c(Curry,title[curry2])
Curry <- c(Curry,title[curry3])

Curry.url <- c(Curry.url, data[curry1])
Curry.url <- c(Curry.url, data[curry2])
Curry.url <- c(Curry.url, data[curry3])

Curry.date <- c(Curry.date, date[curry1])
Curry.date <- c(Curry.date, date[curry2])
Curry.date <- c(Curry.date, date[curry3])

Harden <- c()
Harden.url <- c()
Harden.date <- c()

# Find the titles containing the keywords and group them
harden1 <- grep("Harden", title)
harden2 <- grep("哈登", title)

Harden <- c(Harden,title[harden1])
Harden <- c(Harden,title[harden2])

Harden.url <- c(Harden.url, data[harden1])
Harden.url <- c(Harden.url, data[harden2])

Harden.date <- c(Harden.date, date[harden1])
Harden.date <- c(Harden.date, date[harden2])
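As a side note, the three copy-pasted blocks above could be collapsed into one helper. A minimal sketch (match_player is a hypothetical name; using a single alternation regex per player also avoids listing a title twice when it matches more than one keyword, e.g. both "James" and "LBJ"):

# One grep with an alternation pattern per player
match_player <- function(pattern, title, data, date) {
  idx <- grep(pattern, title)
  list(title = title[idx], url = data[idx], date = date[idx])
}

lbj    <- match_player("James|LBJ|姆斯",  title, data, date)
curry  <- match_player("Curry|柯瑞|咖哩", title, data, date)
harden <- match_player("Harden|哈登",     title, data, date)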

4. Scrape and classify the comments, clean the text, and build the term-document matrix (TDM)

message <- list()
cc <- worker()            # jiebaR segmenter for Chinese word segmentation
LBJTDF <- data.frame()    # term-document data frames, one per player
SCTDF <- data.frame()
JHTDF <- data.frame()
player <- c()             # per-post metadata for the plots in steps 7 and 8
postdate <- c()
hot <- c()

# Scrape the comments of every post whose title mentions 姆斯, LBJ, or James
for(i in c(1:length(Lebron))){
  html <- htmlParse(GET(Lebron.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "LBJ")
  if(grepl("3/0", Lebron.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Lebron.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Lebron.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Lebron.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the LeBron term-document data frame
  if(i == 1){
    LBJTDF <- abc
  } else {
    LBJTDF <- merge(LBJTDF, abc, by = "word", all = T)
  }
}

# Scrape the comments of every post whose title mentions 柯瑞, 咖哩, or Curry
for(i in c(1:length(Curry))){
  html <- htmlParse(GET(Curry.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "Curry")
  if(grepl("3/0", Curry.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Curry.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Curry.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Curry.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the Curry term-document data frame
  if(i == 1){
    SCTDF <- abc
  } else {
    SCTDF <- merge(SCTDF, abc, by = "word", all = T)
  }
}

# Scrape the comments of every post whose title mentions 哈登 or Harden
for(i in c(1:length(Harden))){
  html <- htmlParse(GET(Harden.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "Harden")
  if(grepl("3/0", Harden.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Harden.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Harden.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Harden.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the Harden term-document data frame
  if(i == 1){
    JHTDF <- abc
  } else {
    JHTDF <- merge(JHTDF, abc, by = "word", all = T)
  }
}
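Since the three loops above differ only in the URL/date vectors and the output objects, they could be folded into one function. A minimal sketch under the same assumptions (scrape_player is a hypothetical name; cc, the date buckets, and the popularity thresholds are reused from above):

scrape_player <- function(urls, dates, label) {
  tdf  <- data.frame()
  meta <- data.frame()
  for (i in seq_along(urls)) {
    html <- htmlParse(GET(urls[i]), encoding = "UTF-8")
    msg  <- unlist(xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue))

    # Same date buckets as above (every post here is from March)
    bucket <- if (grepl("3/0", dates[i])) "3/01~3/09"
              else if (grepl("3/1", dates[i])) "3/10~3/19"
              else if (grepl("3/2", dates[i])) "3/20~3/29"
              else "3/30~3/31"
    # Same popularity thresholds as above
    heat <- if (length(msg) > 100) "Boom!"
            else if (length(msg) > 75) "Hot"
            else if (length(msg) > 50) "Soso"
            else "Cold"
    meta <- rbind(meta, data.frame(Player = label, Date = bucket, Popularity = heat))

    # Same cleaning and segmentation as above
    corp <- VCorpus(VectorSource(msg))
    corp <- tm_map(corp, removePunctuation)
    corp <- tm_map(corp, removeNumbers)
    corp <- tm_map(corp, content_transformer(function(x) gsub("[A-Za-z0-9]", "", x)))
    counts <- data.frame(table(cc[sapply(corp, as.character)]))
    colnames(counts) <- c("word", as.character(i))
    tdf <- if (i == 1) counts else merge(tdf, counts, by = "word", all = TRUE)
  }
  list(tdf = tdf, meta = meta)
}

# Hypothetical usage, reproducing the LeBron loop:
# lbj <- scrape_player(Lebron.url, Lebron.date, "LBJ")
# LBJTDF <- lbj$tdf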



# Words absent from an article come out of merge() as NA; set them to 0
LBJTDF[is.na(LBJTDF)] <- 0
JHTDF[is.na(JHTDF)] <- 0
SCTDF[is.na(SCTDF)] <- 0

library(knitr)
kable(head(LBJTDF))
(head of LBJTDF: one row per segmented word, e.g. 白鬍子, 辦法, with one count column for each of the 99 LeBron articles)
kable(head(SCTDF))
(head of SCTDF: one row per segmented word, e.g. 暗號, 白癡, with one count column for each of the 50 Curry articles)
kable(head(JHTDF))
(head of JHTDF: one row per segmented word, e.g. 爸爸, with one count column for each of the 39 Harden articles)

5. Convert the finished TDM into TF-IDF

n <- length(Lebron)   # number of LeBron articles
library(Matrix)       # for nnzero()

# tf denominator: total word count of each article (column sums)
tf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 2, sum)

# idf: log2(N / number of articles containing the word)
idfCal1 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 1, idfCal1)

doc1.tfidf <- LBJTDF

# weight each cell: (count / article length) * idf
for(x in 1:nrow(LBJTDF))
{
  for(y in 2:ncol(LBJTDF))
  {
    doc1.tfidf[x,y] <- (doc1.tfidf[x,y] / tf1[y-1]) * idf1[x]
  }
}

n <- length(Curry)
tf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 2, sum)

idfCal2 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 1, idfCal2)

doc2.tfidf <- SCTDF

for(x in 1:nrow(SCTDF))
{
  for(y in 2:ncol(SCTDF))
  {
    doc2.tfidf[x,y] <- (doc2.tfidf[x,y] / tf2[y-1]) * idf2[x]
  }
}

n <- length(Harden)
tf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 2, sum)
idfCal3 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 1, idfCal3)

doc3.tfidf <- JHTDF

for(x in 1:nrow(JHTDF))
{
  for(y in 2:ncol(JHTDF))
  {
    doc3.tfidf[x,y] <- (doc3.tfidf[x,y] / tf3[y-1]) * idf3[x]
  }
}
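As an aside, the three doubly nested loops above can each be replaced by one vectorized helper computing the same weights. A minimal sketch under the same data layout (tfidf is a hypothetical name; the first column must be word and the rest per-article counts):

# Vectorized TF-IDF over a term-document data frame
tfidf <- function(tdf) {
  m   <- as.matrix(tdf[, -1])
  tf  <- sweep(m, 2, colSums(m), "/")    # term frequency: count / article length
  idf <- log2(ncol(m) / rowSums(m > 0))  # idf: log2(N / articles containing word)
  cbind(tdf["word"], tf * idf)
}

# Equivalent to the loops above:
# doc1.tfidf <- tfidf(LBJTDF); doc2.tfidf <- tfidf(SCTDF); doc3.tfidf <- tfidf(JHTDF)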

6. List the most important words in each article's comments

6-1 Top words in the Lebron James articles

# For each article, take its six highest-weighted words (column 2 is article 1)
topwords <- subset(head(doc1.tfidf[order(doc1.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc1.tfidf))){
  topwords <- cbind(topwords, head(doc1.tfidf[order(doc1.tfidf[,i], decreasing = TRUE),])[1])
}


AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1     Freq
389   曇花        7
20    保養        5
396   體能        5
80    得分        4
335   人品        4
402   統治力      4
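For what it's worth, the per-article top-6 extraction above can also be written without the cbind loop. A sketch (top_words is a hypothetical helper; ties may order slightly differently):

top_words <- function(tfidf.df, k = 6) {
  # one column of k top words per article
  apply(tfidf.df[, -1], 2, function(w) as.character(tfidf.df$word[order(w, decreasing = TRUE)[1:k]]))
}
# AllTop equivalent: sort(table(top_words(doc1.tfidf)), decreasing = TRUE)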

6-2 Top words in the Stephen Curry articles

# For each article, take its six highest-weighted words
topwords <- subset(head(doc2.tfidf[order(doc2.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc2.tfidf))){
  topwords <- cbind(topwords, head(doc2.tfidf[order(doc2.tfidf[,i], decreasing = TRUE),])[1])
}

AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1       Freq
21    笨拙          5
17    報應          3
31    不讓          3
32    不是故意      3
105   火箭          3
257   早日康復      3

6-3 Top words in the James Harden articles

# For each article, take its six highest-weighted words
topwords <- subset(head(doc3.tfidf[order(doc3.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc3.tfidf))){
  topwords <- cbind(topwords, head(doc3.tfidf[order(doc3.tfidf[,i], decreasing = TRUE),])[1])
}

AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1     Freq
53    犯規        4
200   戰績        3
211   走步        3
2     阿泰        2
36    單打        2
42    地雷        2

7. Post dates and posting volume

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
df1 <- data.frame(player,postdate,hot)

names(df1) <- c("Player", "Date", "Popularity")
ggplot(df1, aes(Date,fill = Player)) + geom_bar(position="dodge")

8. How hotly each player is discussed

ggplot(df1, aes(Player,fill = Popularity)) + geom_bar(position="dodge") 

9. Conclusion

This TF-IDF analysis showed me which words PTT users on Taiwan's largest forum most often use when commenting on Lebron, Harden, and Curry, three of today's elite NBA players. Those words in turn reveal what storylines currently surround each player and how they have been performing lately.

Lebron's top words, 體能 (athleticism), 統治力 (dominance), and 得分 (scoring), show that even at 33 he still rules the NBA court.

Curry's 早日康復 (get well soon) tells us he has been troubled by injuries recently, which also cost him attention in late March (3/8 and 3/22 were the days he got hurt, which is why there were more articles then).

Finally, Harden's 犯規 (fouls), 戰績 (record), and 走步 (traveling) fit his long-standing image, and show that the Rockets' record this year (first in the West) was widely discussed by PTT users.

While crawling, I also gathered the comment counts of each player's articles, which measures the three stars' popularity.

Even though all three are hugely popular, Lebron James is clearly still the NBA's number one (both the post counts and the comment volume show it).

If I had to name a drawback of TF-IDF, it is that with a lot of data (like the full month of articles processed here), the repeated computation gets slow. Even so, its ability to filter out filler words impressed me, and I hope to have more time to study this algorithm in greater depth!