Introduction

TF-IDF is a text-mining tool: it multiplies TF, the term frequency, by IDF, the inverse document frequency, to produce a weight. This lets you find words across a collection of documents that appear frequently in a given document yet are "relatively distinctive" to it.
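Written out, for a word $w$ in document $d$ within a collection of $N$ documents (this is the same computation carried out in step 5 below, where $n_{w,d}$ is the number of times $w$ appears in $d$):

$$\mathrm{tf}(w,d)=\frac{n_{w,d}}{\sum_{w'} n_{w',d}},\qquad \mathrm{idf}(w)=\log_2\frac{N}{\lvert\{d : n_{w,d}>0\}\rvert},\qquad \text{tf-idf}(w,d)=\mathrm{tf}(w,d)\cdot\mathrm{idf}(w)$$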

For this exercise, I borrowed the idea from the site our TA shared and looked at how PTT users discuss the three players with the loudest MVP buzz this year: Curry, Lebron, and Harden.

I wanted to see how much attention each of them draws and when, and what kinds of reactions and comments they get. Let's dive in!

1. Load the packages

library(bitops)   # bitwise ops (RCurl dependency)
library(httr)     # GET() for fetching pages
library(RCurl)
library(XML)      # htmlParse(), xpathSApply()
library(NLP)
library(tm)       # corpus handling and text cleaning
library(tmcn)     # Chinese text-mining utilities
library(jiebaRD)  # dictionaries for jiebaR
library(jiebaR)   # Chinese word segmentation
library(dplyr)

2. Crawl the required PTT post URLs and titles

data <- list()
title <- list()
date <- list()

# Crawl the March posts on the PTT NBA board (index pages 5702 to 5794)
for( i in c(5702:5794)){
  tmp <- paste(i, '.html', sep='')
  url <- paste('https://www.ptt.cc/bbs/NBA/index', tmp, sep='')
  html <- htmlParse(GET(url),encoding = "UTF-8")
  title.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlValue)
  url.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlAttrs)
  date.list <- xpathSApply(html, "//div[@class='meta']/div[@class='date']", xmlValue)
  data <- rbind(data, as.matrix(paste('https://www.ptt.cc', url.list, sep='')))
  title <- rbind(title, as.matrix(title.list))
  date <- rbind(date, as.matrix(date.list))
}

# data holds the URLs, title holds the titles, date holds the dates
data <- unlist(data)
title <- unlist(title)
date <- unlist(date)

A small problem I ran into (now solved)

I originally used getURL, but url.list came back NULL; after switching to GET it worked.
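For comparison, here is the failing call next to the working one. A likely cause is that RCurl needs https handled explicitly, while httr's GET does it for you; the ssl.verifypeer workaround below is an untested assumption on my part (url is the variable from the loop above):

# What failed: the content from getURL() yielded a NULL url.list
# html <- htmlParse(getURL(url), encoding = "UTF-8")
# A possible RCurl workaround (untested assumption):
# html <- htmlParse(getURL(url, ssl.verifypeer = FALSE), encoding = "UTF-8")
# What this script uses instead:
html <- htmlParse(GET(url), encoding = "UTF-8")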

3. Find the titles that contain each player's keywords

Lebron <- c()
Lebron.url <- c()
Lebron.date <- c()

# Find the titles containing the keywords and group them
lebron1 <- grep("James", title)
lebron2 <- grep("LBJ", title)
lebron3 <- grep("姆斯", title)

Lebron <- c(Lebron,title[lebron1])
Lebron <- c(Lebron,title[lebron2])
Lebron <- c(Lebron,title[lebron3])

Lebron.url <- c(Lebron.url, data[lebron1])
Lebron.url <- c(Lebron.url, data[lebron2])
Lebron.url <- c(Lebron.url, data[lebron3])

Lebron.date <- c(Lebron.date, date[lebron1])
Lebron.date <- c(Lebron.date, date[lebron2])
Lebron.date <- c(Lebron.date, date[lebron3])

Curry <- c()
Curry.url <- c()
Curry.date <- c()

# Find the titles containing the keywords and group them
curry1 <- grep("Curry", title)
curry2 <- grep("柯瑞", title)
curry3 <- grep("咖哩", title)

Curry <- c(Curry,title[curry1])
Curry <- c(Curry,title[curry2])
Curry <- c(Curry,title[curry3])

Curry.url <- c(Curry.url, data[curry1])
Curry.url <- c(Curry.url, data[curry2])
Curry.url <- c(Curry.url, data[curry3])

Curry.date <- c(Curry.date, date[curry1])
Curry.date <- c(Curry.date, date[curry2])
Curry.date <- c(Curry.date, date[curry3])

Harden <- c()
Harden.url <- c()
Harden.date <- c()

# Find the titles containing the keywords and group them
harden1 <- grep("Harden", title)
harden2 <- grep("哈登", title)

Harden <- c(Harden,title[harden1])
Harden <- c(Harden,title[harden2])

Harden.url <- c(Harden.url, data[harden1])
Harden.url <- c(Harden.url, data[harden2])

Harden.date <- c(Harden.date, date[harden1])
Harden.date <- c(Harden.date, date[harden2])
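As a side note, the three copy-pasted blocks above could be collapsed into one helper. A minimal sketch (match_player is a hypothetical name; using a single alternation regex per player also avoids listing a title twice when it matches more than one keyword, e.g. both "James" and "LBJ"):

# One grep with an alternation pattern per player
match_player <- function(pattern, title, data, date) {
  idx <- grep(pattern, title)
  list(title = title[idx], url = data[idx], date = date[idx])
}

lbj    <- match_player("James|LBJ|姆斯",  title, data, date)
curry  <- match_player("Curry|柯瑞|咖哩", title, data, date)
harden <- match_player("Harden|哈登",     title, data, date)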

4. Scrape and classify the comments, clean the text, and build the term-document matrix (TDM)

message <- list()
cc <- worker()            # jiebaR segmenter for Chinese word segmentation
LBJTDF <- data.frame()    # term-document data frames, one per player
SCTDF <- data.frame()
JHTDF <- data.frame()
player <- c()             # per-post metadata for the plots in steps 7 and 8
postdate <- c()
hot <- c()

# Scrape the comments of every post whose title mentions 姆斯, LBJ, or James
for(i in c(1:length(Lebron))){
  html <- htmlParse(GET(Lebron.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "LBJ")
  if(grepl("3/0", Lebron.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Lebron.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Lebron.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Lebron.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the LeBron term-document data frame
  if(i == 1){
    LBJTDF <- abc
  } else {
    LBJTDF <- merge(LBJTDF, abc, by = "word", all = T)
  }
}

# Scrape the comments of every post whose title mentions 柯瑞, 咖哩, or Curry
for(i in c(1:length(Curry))){
  html <- htmlParse(GET(Curry.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "Curry")
  if(grepl("3/0", Curry.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Curry.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Curry.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Curry.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the Curry term-document data frame
  if(i == 1){
    SCTDF <- abc
  } else {
    SCTDF <- merge(SCTDF, abc, by = "word", all = T)
  }
}

# Scrape the comments of every post whose title mentions 哈登 or Harden
for(i in c(1:length(Harden))){
  html <- htmlParse(GET(Harden.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)

  # Bucket the post by date
  player <- c(player, "Harden")
  if(grepl("3/0", Harden.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Harden.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Harden.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Harden.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  }

  # Bucket the post by popularity (comment count)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }

  # Clean the text: strip punctuation, digits, and ASCII letters
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, content_transformer(function(x) {
    gsub("[A-Za-z0-9]", "", x)
  }))

  # Segment the words and count frequencies
  abc <- data.frame(table(cc[sapply(d.corpus, as.character)]))
  colnames(abc) <- c("word", as.character(i))

  # Merge this article's counts into the Harden term-document data frame
  if(i == 1){
    JHTDF <- abc
  } else {
    JHTDF <- merge(JHTDF, abc, by = "word", all = T)
  }
}
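Since the three loops above differ only in the URL/date vectors and the output objects, they could be folded into one function. A minimal sketch under the same assumptions (scrape_player is a hypothetical name; cc, the date buckets, and the popularity thresholds are reused from above):

scrape_player <- function(urls, dates, label) {
  tdf  <- data.frame()
  meta <- data.frame()
  for (i in seq_along(urls)) {
    html <- htmlParse(GET(urls[i]), encoding = "UTF-8")
    msg  <- unlist(xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue))

    # Same date buckets as above (every post here is from March)
    bucket <- if (grepl("3/0", dates[i])) "3/01~3/09"
              else if (grepl("3/1", dates[i])) "3/10~3/19"
              else if (grepl("3/2", dates[i])) "3/20~3/29"
              else "3/30~3/31"
    # Same popularity thresholds as above
    heat <- if (length(msg) > 100) "Boom!"
            else if (length(msg) > 75) "Hot"
            else if (length(msg) > 50) "Soso"
            else "Cold"
    meta <- rbind(meta, data.frame(Player = label, Date = bucket, Popularity = heat))

    # Same cleaning and segmentation as above
    corp <- VCorpus(VectorSource(msg))
    corp <- tm_map(corp, removePunctuation)
    corp <- tm_map(corp, removeNumbers)
    corp <- tm_map(corp, content_transformer(function(x) gsub("[A-Za-z0-9]", "", x)))
    counts <- data.frame(table(cc[sapply(corp, as.character)]))
    colnames(counts) <- c("word", as.character(i))
    tdf <- if (i == 1) counts else merge(tdf, counts, by = "word", all = TRUE)
  }
  list(tdf = tdf, meta = meta)
}

# Hypothetical usage, reproducing the LeBron loop:
# lbj <- scrape_player(Lebron.url, Lebron.date, "LBJ")
# LBJTDF <- lbj$tdf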



# Words absent from an article come out of merge() as NA; set them to 0
LBJTDF[is.na(LBJTDF)] <- 0
JHTDF[is.na(JHTDF)] <- 0
SCTDF[is.na(SCTDF)] <- 0

library(knitr)
kable(head(LBJTDF))
(head of LBJTDF: one row per segmented word, e.g. 白鬍子, 辦法, with one count column for each of the 99 LeBron articles)
kable(head(SCTDF))
(head of SCTDF: one row per segmented word, e.g. 暗號, 白癡, with one count column for each of the 50 Curry articles)
kable(head(JHTDF))
(head of JHTDF: one row per segmented word, e.g. 爸爸, with one count column for each of the 39 Harden articles)

5. Convert the finished TDM into TF-IDF

n <- length(Lebron)   # number of LeBron articles
library(Matrix)       # for nnzero()

# tf denominator: total word count of each article (column sums)
tf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 2, sum)

# idf: log2(N / number of articles containing the word)
idfCal1 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 1, idfCal1)

doc1.tfidf <- LBJTDF

# weight each cell: (count / article length) * idf
for(x in 1:nrow(LBJTDF))
{
  for(y in 2:ncol(LBJTDF))
  {
    doc1.tfidf[x,y] <- (doc1.tfidf[x,y] / tf1[y-1]) * idf1[x]
  }
}

n <- length(Curry)
tf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 2, sum)

idfCal2 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 1, idfCal2)

doc2.tfidf <- SCTDF

for(x in 1:nrow(SCTDF))
{
  for(y in 2:ncol(SCTDF))
  {
    doc2.tfidf[x,y] <- (doc2.tfidf[x,y] / tf2[y-1]) * idf2[x]
  }
}

n <- length(Harden)
tf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 2, sum)
idfCal3 <- function(word_doc)
{ 
  log2( n / nnzero(word_doc) ) 
}
idf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 1, idfCal3)

doc3.tfidf <- JHTDF

for(x in 1:nrow(JHTDF))
{
  for(y in 2:ncol(JHTDF))
  {
    doc3.tfidf[x,y] <- (doc3.tfidf[x,y] / tf3[y-1]) * idf3[x]
  }
}
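As an aside, the three doubly nested loops above can each be replaced by one vectorized helper computing the same weights. A minimal sketch under the same data layout (tfidf is a hypothetical name; the first column must be word and the rest per-article counts):

# Vectorized TF-IDF over a term-document data frame
tfidf <- function(tdf) {
  m   <- as.matrix(tdf[, -1])
  tf  <- sweep(m, 2, colSums(m), "/")    # term frequency: count / article length
  idf <- log2(ncol(m) / rowSums(m > 0))  # idf: log2(N / articles containing word)
  cbind(tdf["word"], tf * idf)
}

# Equivalent to the loops above:
# doc1.tfidf <- tfidf(LBJTDF); doc2.tfidf <- tfidf(SCTDF); doc3.tfidf <- tfidf(JHTDF)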

6. List the most important words in each article's comments

6-1 Top words in the Lebron James articles

# For each article, take its six highest-weighted words (column 2 is article 1)
topwords <- subset(head(doc1.tfidf[order(doc1.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc1.tfidf))){
  topwords <- cbind(topwords, head(doc1.tfidf[order(doc1.tfidf[,i], decreasing = TRUE),])[1])
}


AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1     Freq
389   曇花        7
20    保養        5
396   體能        5
80    得分        4
335   人品        4
402   統治力      4
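For what it's worth, the per-article top-6 extraction above can also be written without the cbind loop. A sketch (top_words is a hypothetical helper; ties may order slightly differently):

top_words <- function(tfidf.df, k = 6) {
  # one column of k top words per article
  apply(tfidf.df[, -1], 2, function(w) as.character(tfidf.df$word[order(w, decreasing = TRUE)[1:k]]))
}
# AllTop equivalent: sort(table(top_words(doc1.tfidf)), decreasing = TRUE)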

6-2 Top words in the Stephen Curry articles

# For each article, take its six highest-weighted words
topwords <- subset(head(doc2.tfidf[order(doc2.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc2.tfidf))){
  topwords <- cbind(topwords, head(doc2.tfidf[order(doc2.tfidf[,i], decreasing = TRUE),])[1])
}

AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1       Freq
21    笨拙          5
17    報應          3
31    不讓          3
32    不是故意      3
105   火箭          3
257   早日康復      3

6-3 Top words in the James Harden articles

# For each article, take its six highest-weighted words
topwords <- subset(head(doc3.tfidf[order(doc3.tfidf[,2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc3.tfidf))){
  topwords <- cbind(topwords, head(doc3.tfidf[order(doc3.tfidf[,i], decreasing = TRUE),])[1])
}

AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]

kable(head(AllTop))
      Var1     Freq
53    犯規        4
200   戰績        3
211   走步        3
2     阿泰        2
36    單打        2
42    地雷        2

7. Post dates and posting volume

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
df1 <- data.frame(player,postdate,hot)

names(df1) <- c("Player", "Date", "Popularity")
ggplot(df1, aes(Date,fill = Player)) + geom_bar(position="dodge")

8. How hotly each player is discussed

ggplot(df1, aes(Player,fill = Popularity)) + geom_bar(position="dodge") 

9. Conclusion

This TF-IDF analysis showed me which words PTT users on Taiwan's largest forum most often use when commenting on Lebron, Harden, and Curry, three of today's elite NBA players. Those words in turn reveal what storylines currently surround each player and how they have been performing lately.

Lebron's top words, 體能 (athleticism), 統治力 (dominance), and 得分 (scoring), show that even at 33 he still rules the NBA court.

Curry's 早日康復 (get well soon) tells us he has been troubled by injuries recently, which also cost him attention in late March (3/8 and 3/22 were the days he got hurt, which is why there were more articles then).

Finally, Harden's 犯規 (fouls), 戰績 (record), and 走步 (traveling) fit his long-standing image, and show that the Rockets' record this year (first in the West) was widely discussed by PTT users.

While crawling, I also gathered the comment counts of each player's articles, which measures the three stars' popularity.

Even though all three are hugely popular, Lebron James is clearly still the NBA's number one (both the post counts and the comment volume show it).

If I had to name a drawback of TF-IDF, it is that with a lot of data (like the full month of articles processed here), the repeated computation gets slow. Even so, its ability to filter out filler words impressed me, and I hope to have more time to study this algorithm in greater depth!