TF-IDF is a text-mining tool that multiplies TF (Term Frequency) by IDF (Inverse Document Frequency) to produce a weight, so that across a collection of documents you can find words that appear frequently in a document yet are relatively distinctive to it.
For this exercise I followed the idea from the site our TA shared and looked at how ptt users talk about the three players with the loudest MVP buzz this year: Curry, Lebron, and Harden.
We will see how much attention each of them gets and when, and what kinds of comments the posters leave about them. Let's read on~~
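Before starting, here is a tiny sketch of the weight formula on a made-up two-word, three-document count matrix. Everything in this snippet (the matrix `toy` and the object names) is my own illustration, not part of the crawl below; it uses the same definitions as the code later in this post, namely TF = count / total words in that document and IDF = log2(N / number of documents containing the word).
library(Matrix)   # nnzero() counts how many documents a word appears in
# Made-up term-document count matrix: rows = words, columns = documents
toy <- matrix(c(3, 0, 1,
                1, 1, 1),
              nrow = 2, byrow = TRUE,
              dimnames = list(c("rare_word", "common_word"), paste0("doc", 1:3)))
tf    <- t(t(toy) / colSums(toy))                  # TF: count / total words in that document
idf   <- log2(ncol(toy) / apply(toy, 1, nnzero))   # IDF: log2(N / documents containing the word)
tfidf <- tf * idf                                  # weight = TF * IDF
tfidf
# "rare_word" appears in only 2 of the 3 documents, so it keeps a positive weight;
# "common_word" appears in every document, so its weight is 0 -- it is not distinctive.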
library(bitops)
library(httr)
library(RCurl)
library(XML)
library(NLP)
library(tm)
library(tmcn)
library(jiebaRD)
library(jiebaR)
library(dplyr)
data <- list()
title <- list()
date <- list()
# Crawl the March posts on the ptt NBA board (index pages 5702 to 5794)
for( i in c(5702:5794)){
  tmp <- paste(i, '.html', sep='')
  url <- paste('https://www.ptt.cc/bbs/NBA/index', tmp, sep='')
  html <- htmlParse(GET(url), encoding = "UTF-8")
  title.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlValue)
  url.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlAttrs)
  date.list <- xpathSApply(html, "//div[@class='meta']/div[@class='date']", xmlValue)
  data <- rbind(data, as.matrix(paste('https://www.ptt.cc', url.list, sep='')))
  title <- rbind(title, as.matrix(title.list))
  date <- rbind(date, as.matrix(date.list))
}
# data stores the article URLs, title stores the titles, date stores the posting dates
data <- unlist(data)
title <- unlist(title)
date <- unlist(date)
I originally used getURL() here, but url.list turned out to be NULL; after switching to GET() everything came through correctly.
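My guess about the cause (an assumption on my part, not something I verified) is that getURL() does not follow ptt's redirect to the https page by default, so the body it returns is empty and the XPath queries give NULL, whereas httr::GET() follows redirects automatically. If that is indeed the cause, asking RCurl to follow redirects itself should also work, roughly like the untested sketch below (using the original scheme-less address):
# Untested alternative: keep getURL() but let RCurl follow the redirect
raw  <- getURL('www.ptt.cc/bbs/NBA/index5702.html', followlocation = TRUE, .encoding = 'UTF-8')
html <- htmlParse(raw, encoding = "UTF-8")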
Lebron <- c()
Lebron.url <- c()
Lebron.date <- c()
# Find the titles containing LeBron keywords and collect their titles, URLs, and dates
lebron1 <- grep("James", title)
lebron2 <- grep("LBJ", title)
lebron3 <- grep("姆斯", title)
Lebron <- c(Lebron,title[lebron1])
Lebron <- c(Lebron,title[lebron2])
Lebron <- c(Lebron,title[lebron3])
Lebron.url <- c(Lebron.url, data[lebron1])
Lebron.url <- c(Lebron.url, data[lebron2])
Lebron.url <- c(Lebron.url, data[lebron3])
Lebron.date <- c(Lebron.date, date[lebron1])
Lebron.date <- c(Lebron.date, date[lebron2])
Lebron.date <- c(Lebron.date, date[lebron3])
Curry <- c()
Curry.url <- c()
Curry.date <- c()
# Find the titles containing Curry keywords and collect their titles, URLs, and dates
curry1 <- grep("Curry", title)
curry2 <- grep("柯瑞", title)
curry3 <- grep("咖哩", title)
Curry <- c(Curry,title[curry1])
Curry <- c(Curry,title[curry2])
Curry <- c(Curry,title[curry3])
Curry.url <- c(Curry.url, data[curry1])
Curry.url <- c(Curry.url, data[curry2])
Curry.url <- c(Curry.url, data[curry3])
Curry.date <- c(Curry.date, date[curry1])
Curry.date <- c(Curry.date, date[curry2])
Curry.date <- c(Curry.date, date[curry3])
Harden <- c()
Harden.url <- c()
Harden.date <- c()
# Find the titles containing Harden keywords and collect their titles, URLs, and dates
harden1 <- grep("Harden", title)
harden2 <- grep("哈登", title)
Harden <- c(Harden,title[harden1])
Harden <- c(Harden,title[harden2])
Harden.url <- c(Harden.url, data[harden1])
Harden.url <- c(Harden.url, data[harden2])
Harden.date <- c(Harden.date, date[harden1])
Harden.date <- c(Harden.date, date[harden2])
message <- list()
cc <- worker()   # jieba word segmenter
LBJTDF <- data.frame()   # term-document frequency tables for LeBron, Curry, and Harden
SCTDF <- data.frame()
JHTDF <- data.frame()
player <- c()    # per-article labels used for the summary plots at the end
postdate <- c()
hot <- c()
# Crawl the comments of every article whose title mentions 姆斯, LBJ, or James
for(i in c(1:length(Lebron))){
  html <- htmlParse(GET(Lebron.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)
  # Classify by posting date
  player <- c(player, "LBJ")
  if(grepl("3/0", Lebron.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Lebron.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Lebron.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Lebron.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  } else {
    postdate <- c(postdate, "Other")  # keeps the vectors aligned if a post falls outside March
  }
  # Classify by popularity (number of comments)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }
  # Text cleaning: strip punctuation, digits, and ASCII letters so only Chinese remains
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, function(word) {
    gsub("[A-Za-z0-9]", "", word)
  })
  # Word segmentation with jieba, then count the term frequencies of this article
  abc <- data.frame(table(cc[as.character(d.corpus)]))
  colnames(abc) <- c("word", as.character(i))
  # Merge into the LeBron term-document frequency table
  if(i == 1){
    LBJTDF <- abc
  } else {
    LBJTDF <- merge(LBJTDF, abc, by = "word", all = T)
  }
}
# Crawl the comments of every article whose title mentions 柯瑞, 咖哩, or Curry
for(i in c(1:length(Curry))){
  html <- htmlParse(GET(Curry.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)
  # Classify by posting date
  player <- c(player, "Curry")
  if(grepl("3/0", Curry.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Curry.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Curry.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Curry.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  } else {
    postdate <- c(postdate, "Other")  # keeps the vectors aligned if a post falls outside March
  }
  # Classify by popularity (number of comments)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }
  # Text cleaning: strip punctuation, digits, and ASCII letters so only Chinese remains
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, function(word) {
    gsub("[A-Za-z0-9]", "", word)
  })
  # Word segmentation with jieba, then count the term frequencies of this article
  abc <- data.frame(table(cc[as.character(d.corpus)]))
  colnames(abc) <- c("word", as.character(i))
  # Merge into the Curry term-document frequency table
  if(i == 1){
    SCTDF <- abc
  } else {
    SCTDF <- merge(SCTDF, abc, by = "word", all = T)
  }
}
# Crawl the comments of every article whose title mentions 哈登 or Harden
for(i in c(1:length(Harden))){
  html <- htmlParse(GET(Harden.url[i]), encoding = "UTF-8")
  message.list <- xpathSApply(html, "//div[@class='push']/span[@class='f3 push-content']", xmlValue)
  message <- unlist(message.list)
  # Classify by posting date
  player <- c(player, "Harden")
  if(grepl("3/0", Harden.date[i])){
    postdate <- c(postdate, "3/01~3/09")
  } else if(grepl("3/1", Harden.date[i])){
    postdate <- c(postdate, "3/10~3/19")
  } else if(grepl("3/2", Harden.date[i])){
    postdate <- c(postdate, "3/20~3/29")
  } else if(grepl("3/3", Harden.date[i])){
    postdate <- c(postdate, "3/30~3/31")
  } else {
    postdate <- c(postdate, "Other")  # keeps the vectors aligned if a post falls outside March
  }
  # Classify by popularity (number of comments)
  if(length(message) > 100){
    hot <- c(hot, "Boom!")
  } else if(length(message) > 75){
    hot <- c(hot, "Hot")
  } else if(length(message) > 50){
    hot <- c(hot, "Soso")
  } else {
    hot <- c(hot, "Cold")
  }
  # Text cleaning: strip punctuation, digits, and ASCII letters so only Chinese remains
  d.corpus <- VCorpus( VectorSource(message) )
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)
  d.corpus <- tm_map(d.corpus, function(word) {
    gsub("[A-Za-z0-9]", "", word)
  })
  # Word segmentation with jieba, then count the term frequencies of this article
  abc <- data.frame(table(cc[as.character(d.corpus)]))
  colnames(abc) <- c("word", as.character(i))
  # Merge into the Harden term-document frequency table
  if(i == 1){
    JHTDF <- abc
  } else {
    JHTDF <- merge(JHTDF, abc, by = "word", all = T)
  }
}
# Set missing values to 0
LBJTDF[is.na(LBJTDF)] <- 0
JHTDF[is.na(JHTDF)] <- 0
SCTDF[is.na(SCTDF)] <- 0
library(knitr)
kable(head(LBJTDF))
| word | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 阿 | 2 | 0 | 6 | 6 | 0 | 1 | 4 | 0 | 0 | 2 | 16 | 10 | 3 | 1 | 1 | 1 | 0 | 3 | 2 | 3 | 1 | 0 | 0 | 2 | 0 | 1 | 0 | 2 | 7 | 0 | 22 | 2 | 0 | 0 | 0 | 4 | 2 | 4 | 0 | 2 | 4 | 1 | 1 | 1 | 13 | 1 | 1 | 0 | 2 | 0 | 7 | 2 | 0 | 6 | 1 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 5 | 2 | 1 | 0 | 1 | 4 | 4 | 3 | 0 | 2 | 1 | 1 | 3 | 1 | 2 | 1 | 1 | 1 | 0 | 0 | 2 | 0 | 0 | 7 | 0 | 5 | 0 | 2 | 4 | 8 | 0 | 1 |
| 啊 | 1 | 7 | 12 | 13 | 0 | 1 | 2 | 1 | 1 | 2 | 46 | 40 | 13 | 1 | 2 | 4 | 2 | 5 | 6 | 2 | 2 | 4 | 0 | 15 | 2 | 4 | 3 | 12 | 20 | 0 | 57 | 3 | 0 | 1 | 2 | 1 | 5 | 18 | 4 | 5 | 1 | 3 | 8 | 4 | 15 | 2 | 6 | 1 | 2 | 9 | 4 | 14 | 2 | 14 | 4 | 5 | 0 | 1 | 2 | 0 | 0 | 3 | 0 | 3 | 4 | 1 | 5 | 2 | 8 | 0 | 1 | 5 | 5 | 21 | 6 | 8 | 1 | 2 | 2 | 5 | 6 | 3 | 0 | 5 | 4 | 8 | 0 | 2 | 0 | 2 | 4 | 2 | 5 | 5 | 9 | 6 | 5 | 3 | 11 |
| 吧 | 2 | 16 | 7 | 4 | 0 | 2 | 5 | 0 | 3 | 0 | 35 | 25 | 7 | 9 | 1 | 5 | 2 | 6 | 3 | 3 | 3 | 5 | 1 | 18 | 1 | 1 | 0 | 8 | 7 | 1 | 19 | 5 | 0 | 2 | 1 | 3 | 5 | 5 | 1 | 7 | 5 | 1 | 3 | 5 | 6 | 4 | 3 | 0 | 8 | 2 | 8 | 22 | 0 | 17 | 2 | 6 | 1 | 0 | 1 | 0 | 2 | 0 | 5 | 6 | 1 | 1 | 8 | 4 | 2 | 3 | 1 | 3 | 6 | 9 | 9 | 6 | 1 | 5 | 4 | 4 | 4 | 11 | 4 | 2 | 0 | 2 | 2 | 0 | 2 | 0 | 9 | 0 | 5 | 7 | 7 | 7 | 13 | 9 | 7 |
| 白鬍子 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 辦法 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 2 | 1 |
| 被 | 3 | 2 | 20 | 8 | 0 | 0 | 4 | 5 | 6 | 5 | 39 | 22 | 14 | 3 | 4 | 3 | 0 | 2 | 8 | 6 | 1 | 8 | 0 | 13 | 1 | 2 | 2 | 2 | 19 | 1 | 28 | 0 | 1 | 5 | 2 | 1 | 11 | 19 | 2 | 2 | 0 | 4 | 3 | 3 | 17 | 3 | 3 | 1 | 3 | 5 | 10 | 9 | 0 | 19 | 8 | 14 | 0 | 0 | 1 | 0 | 0 | 4 | 5 | 4 | 4 | 1 | 4 | 2 | 7 | 1 | 4 | 0 | 2 | 12 | 7 | 7 | 4 | 3 | 1 | 1 | 14 | 3 | 3 | 10 | 2 | 8 | 0 | 3 | 1 | 0 | 29 | 0 | 22 | 2 | 2 | 5 | 5 | 6 | 9 |
kable(head(SCTDF))
| word | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 阿 | 1 | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 2 | 3 | 8 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 0 | 1 | 5 | 2 | 2 | 0 | 0 | 1 | 0 | 0 |
| 啊 | 4 | 37 | 2 | 4 | 0 | 3 | 1 | 2 | 2 | 2 | 0 | 3 | 8 | 16 | 2 | 2 | 4 | 5 | 1 | 1 | 10 | 10 | 18 | 3 | 1 | 1 | 4 | 2 | 3 | 0 | 2 | 4 | 2 | 1 | 0 | 2 | 1 | 4 | 14 | 4 | 3 | 1 | 5 | 3 | 1 | 0 | 1 | 4 | 0 | 0 |
| 欸 | 1 | 9 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 暗號 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 吧 | 1 | 23 | 1 | 2 | 1 | 9 | 9 | 0 | 3 | 2 | 0 | 2 | 14 | 24 | 2 | 3 | 6 | 5 | 5 | 0 | 5 | 3 | 19 | 3 | 3 | 0 | 9 | 4 | 1 | 0 | 0 | 8 | 3 | 6 | 0 | 0 | 1 | 1 | 20 | 14 | 6 | 9 | 8 | 3 | 3 | 0 | 1 | 1 | 0 | 1 |
| 白癡 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
kable(head(JHTDF))
| word | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 阿 | 1 | 0 | 0 | 2 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 7 | 1 | 0 | 3 | 3 | 2 | 6 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 1 | 5 | 1 |
| 啊 | 1 | 3 | 1 | 1 | 0 | 0 | 1 | 4 | 0 | 1 | 4 | 6 | 1 | 1 | 8 | 13 | 8 | 5 | 11 | 8 | 4 | 11 | 1 | 3 | 5 | 2 | 4 | 0 | 2 | 1 | 0 | 0 | 1 | 1 | 10 | 0 | 9 | 6 | 2 |
| 拔 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 把 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 1 | 0 | 0 | 1 | 1 | 2 | 0 | 2 | 1 | 0 | 4 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 1 | 3 | 0 | 4 | 2 | 0 |
| 爸 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 爸爸 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# TF-IDF for the LeBron table: TF = word count / total words in that article,
# IDF = log2(number of articles / number of articles containing the word)
n <- length(Lebron)
tf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 2, sum)
library(Matrix)   # nnzero() counts the articles a word appears in
idfCal1 <- function(word_doc)
{
  log2( n / nnzero(word_doc) )
}
idf1 <- apply(as.matrix(LBJTDF[,2:(n+1)]), 1, idfCal1)
doc1.tfidf <- LBJTDF
for(x in 1:nrow(LBJTDF))
{
  for(y in 2:ncol(LBJTDF))
  {
    doc1.tfidf[x,y] <- (doc1.tfidf[x,y] / tf1[y-1]) * idf1[x]
  }
}
# The same TF-IDF computation for the Curry table
n <- length(Curry)
tf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 2, sum)
idfCal2 <- function(word_doc)
{
log2( n / nnzero(word_doc) )
}
idf2 <- apply(as.matrix(SCTDF[,2:(n+1)]), 1, idfCal2)
doc2.tfidf <- SCTDF
for(x in 1:nrow(SCTDF))
{
for(y in 2:ncol(SCTDF))
{
doc2.tfidf[x,y] <- (doc2.tfidf[x,y] / tf2[y-1]) * idf2[x]
}
}
# The same TF-IDF computation for the Harden table
n <- length(Harden)
tf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 2, sum)
idfCal3 <- function(word_doc)
{
log2( n / nnzero(word_doc) )
}
idf3 <- apply(as.matrix(JHTDF[,2:(n+1)]), 1, idfCal3)
doc3.tfidf <- JHTDF
for(x in 1:nrow(JHTDF))
{
for(y in 2:ncol(JHTDF))
{
doc3.tfidf[x,y] <- (doc3.tfidf[x,y] / tf3[y-1]) * idf3[x]
}
}
# For each LeBron article, keep the six highest-weighted words, then count how often each word shows up across those per-article top lists
topwords <- subset(head(doc1.tfidf[order(doc1.tfidf[2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc1.tfidf))){
topwords <- cbind(topwords, head(doc1.tfidf[order(doc1.tfidf[i], decreasing = TRUE),])[1])
}
AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]
kable(head(AllTop))
|  | Var1 | Freq |
|---|---|---|
| 389 | 曇花 | 7 |
| 20 | 保養 | 5 |
| 396 | 體能 | 5 |
| 80 | 得分 | 4 |
| 335 | 人品 | 4 |
| 402 | 統治力 | 4 |
# The same top-word extraction for the Curry table
topwords <- subset(head(doc2.tfidf[order(doc2.tfidf[2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc2.tfidf))){
topwords <- cbind(topwords, head(doc2.tfidf[order(doc2.tfidf[i], decreasing = TRUE),])[1])
}
AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]
kable(head(AllTop))
|  | Var1 | Freq |
|---|---|---|
| 21 | 笨拙 | 5 |
| 17 | 報應 | 3 |
| 31 | 不讓 | 3 |
| 32 | 不是故意 | 3 |
| 105 | 火箭 | 3 |
| 257 | 早日康復 | 3 |
# The same top-word extraction for the Harden table
topwords <- subset(head(doc3.tfidf[order(doc3.tfidf[2], decreasing = TRUE), ]), select = c(word))
for (i in c(3:ncol(doc3.tfidf))){
topwords <- cbind(topwords, head(doc3.tfidf[order(doc3.tfidf[i], decreasing = TRUE),])[1])
}
AllTop = as.data.frame( table(as.matrix(topwords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]
kable(head(AllTop))
|  | Var1 | Freq |
|---|---|---|
| 53 | 犯規 | 4 |
| 200 | 戰績 | 3 |
| 211 | 走步 | 3 |
| 2 | 阿泰 | 2 |
| 36 | 單打 | 2 |
| 42 | 地雷 | 2 |
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Summary plots: number of articles per date bucket for each player, and the
# popularity (comment-count) distribution for each player
df1 <- data.frame(player, postdate, hot)
names(df1) <- c("Player", "Date", "Popularity")
ggplot(df1, aes(Date, fill = Player)) + geom_bar(position = "dodge")
ggplot(df1, aes(Player, fill = Popularity)) + geom_bar(position = "dodge")
This TF-IDF analysis shows which words posters on ptt, Taiwan's largest forum, most often use when commenting on Lebron, Harden, and Curry, three of today's elite NBA players, and those words also hint at the issues each of them is currently involved in and how they have been playing lately.
Lebron's top words, such as 體能 (stamina), 統治力 (dominance), and 得分 (scoring), show that even at 33 he is still the ruler of the NBA court.
Curry's 早日康復 (get well soon) reflects the injuries that have troubled him recently, which also explains why attention on him dropped toward the end of March (he was injured on 3/8 and 3/22, hence the larger number of articles around those dates).
Finally, Harden's 犯規 (fouls), 戰績 (record), and 走步 (traveling) fit his usual image, and they show how widely the Rockets' record this season (first in the West) was discussed.
During the crawl I also used the comment counts of each player's articles to gauge the three stars' popularity.
All three are hugely popular, but judging from both the number of articles and the volume of comments, Lebron James is clearly still the NBA's number one.
If I had to name a drawback of TF-IDF, it would be that with a lot of data (like the full month of articles processed here) the repetitive computation becomes slow, but its ability to filter out filler words still amazed me. I hope to find more time to study this algorithm in greater depth!
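A closing aside on that speed complaint: most of the time presumably goes into the cell-by-cell double for loops above, and the same weights can be obtained with vectorized matrix operations. Below is a minimal sketch (the helper name tfidfTable is my own, not from the original code) that assumes a count table shaped like LBJTDF / SCTDF / JHTDF, i.e. a word column followed by one count column per article; it applies the same TF and IDF formulas, but one whole table at a time.
library(Matrix)  # nnzero()
# Vectorized TF-IDF over a count table shaped like LBJTDF / SCTDF / JHTDF
# (tfidfTable is a name made up for this sketch)
tfidfTable <- function(tdf) {
  counts <- as.matrix(tdf[, -1])                            # drop the "word" column
  tf     <- t(t(counts) / colSums(counts))                  # TF: count / total words per article
  idf    <- log2(ncol(counts) / apply(counts, 1, nnzero))   # IDF: log2(N / articles containing the word)
  cbind(tdf[, 1, drop = FALSE], tf * idf)                   # weight = TF * IDF
}
# e.g. doc1.tfidf <- tfidfTable(LBJTDF)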