Chris Bail
Duke University
website: https://www.chrisbail.net
github: https://github.com/cbail
Twitter: https://www.twitter.com/chris_bail
R's grepl and gsub functions are the workhorses of basic string manipulation. Consider this snippet of text scraped from Duke's website, which contains stray newline (\n) and tab (\t) characters:

duke_web_scrape <- "Class of 2018: Senior Stories of Discovery, Learning and Serving\n\n\t\t\t\t\t\t\t"

grepl returns TRUE if a pattern appears anywhere in a string:

grepl("Class", duke_web_scrape)
[1] TRUE
gsub("\t", "", duke_web_scrape)
[1] "Class of 2018: Senior Stories of Discovery, Learning and Serving\n\n"
gsub("\t|\n", "", duke_web_scrape)
[1] "Class of 2018: Senior Stories of Discovery, Learning and Serving"
grep can also subset a character vector. The regular expression ^[P] matches elements that begin with a capital P:

some_text <- c("This", "Professor", "is", "not", "so", "great")
some_text[grep("^[P]", some_text)]
[1] "Professor"
text_chunk <- c("[This Professor is not so Great]")

If we try gsub("[", "", text_chunk), R throws an "invalid regular expression" error, because [ is a special character in regular expressions. To match a bracket literally, we "escape" it with two backslashes:

gsub("\\[|\\]", "", text_chunk)
[1] "This Professor is not so Great"
load(url("https://cbail.github.io/Trump_Tweets.Rdata"))
head(trumptweets$text)
[1] "Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world, it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z"
[2] "America is a Nation that believes in the power of redemption. America is a Nation that believes in second chances - and America is a Nation that believes that the best is always yet to come! #PrisonReform https://t.co/Yk5UJUYgHN"
[3] "RT @SteveForbesCEO: .@realDonaldTrump speech on drug costs pays immediate dividends. New @Amgen drug lists at 30% less than expected. Middl…"
[4] "We grieve for the terrible loss of life, and send our support and love to everyone affected by this horrible attack in Texas. To the students, families, teachers and personnel at Santa Fe High School – we are with you in this tragic hour, and we will be with you forever... https://t.co/LtJ0D29Hsv"
[5] "School shooting in Texas. Early reports not looking good. God bless all!"
[6] "Reports are there was indeed at least one FBI representative implanted, for political purposes, into my campaign for president. It took place very early on, and long before the phony Russia Hoax became a “hot” Fake News story. If true - all time biggest political scandal!"
The tm package represents a collection of documents as a "corpus." After installing and loading tm, we can build a corpus from the text of the tweets:

install.packages("tm")
library(tm)
trump_corpus <- Corpus(VectorSource(as.vector(trumptweets$text)))
trump_corpus
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 0
Content: documents: 3196
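To verify the corpus contains what we expect, we can inspect an individual document (a quick sanity check; the index 1 is arbitrary):

# View the text of the first document in the corpus
as.character(trump_corpus[[1]])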
The tidytext package instead represents text as a data frame with one token per row. Its unnest_tokens function tokenizes the tweets into words:

library(tidytext)
library(dplyr)

tidy_trump_tweets <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text)
head(tidy_trump_tweets)
# A tibble: 6 x 2
created_at word
<dttm> <chr>
1 2018-05-18 20:41:21 just
2 2018-05-18 20:41:21 met
3 2018-05-18 20:41:21 with
4 2018-05-18 20:41:21 un
5 2018-05-18 20:41:21 secretary
6 2018-05-18 20:41:21 general
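unnest_tokens can also produce n-grams instead of single words. A brief sketch of tokenizing into bigrams (two-word sequences), keeping the same columns used above:

tidy_trump_bigrams <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)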
Because the data are now tidy, we can count the most frequent words with familiar dplyr verbs:

tidy_trump_tweets %>%
  count(word) %>%
  arrange(desc(n))
# A tibble: 8,690 x 2
word n
<chr> <int>
1 the 3671
2 to 2216
3 and 1959
4 of 1606
5 https 1281
6 t.co 1258
7 a 1248
8 in 1213
9 is 1045
10 for 886
# … with 8,680 more rows
The list is dominated by stop words ("the," "to," "and") that carry little meaning, so we typically remove them. In tm:
trump_corpus <- tm_map(trump_corpus, removeWords, stopwords("english"))
In tidytext:

data("stop_words")
tidy_trump_tweets <- tidy_trump_tweets %>%
  anti_join(stop_words)
Counting again shows more substantive terms:

tidy_trump_tweets %>%
  count(word) %>%
  arrange(desc(n))
# A tibble: 8,121 x 2
word n
<chr> <int>
1 https 1281
2 t.co 1258
3 amp 562
4 rt 351
5 people 302
6 news 271
7 president 235
8 fake 234
9 trump 218
10 country 213
# … with 8,111 more rows
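Such counts are often easier to read as a figure. A quick sketch with ggplot2 (assuming ggplot2 is installed; the cutoff of ten words is arbitrary):

library(ggplot2)
tidy_trump_tweets %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Number of occurrences")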
Punctuation should usually be removed as well. In tm:

trump_corpus <- tm_map(trump_corpus, content_transformer(removePunctuation))

(tidytext strips punctuation automatically during tokenization, so no extra step is needed there.)
Numbers can be removed too. In tm:

trump_corpus <- tm_map(trump_corpus, content_transformer(removeNumbers))

In tidytext:

tidy_trump_tweets <- tidy_trump_tweets[-grep("\\b\\d+\\b", tidy_trump_tweets$word),]
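An equivalent dplyr approach filters rather than indexes; it also avoids a subtle pitfall of the line above, which returns zero rows if no numbers are found at all (a minimal sketch):

tidy_trump_tweets <- tidy_trump_tweets %>%
  filter(!grepl("\\b\\d+\\b", word))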
It is also common to make every term lowercase so that, for example, "Trump" and "trump" are counted as the same word. In tm:

trump_corpus <- tm_map(trump_corpus, content_transformer(tolower))

Once again, tidytext does this for you during tokenization.
Stray whitespace can be stripped as well. In tm:

trump_corpus <- tm_map(trump_corpus, content_transformer(stripWhitespace))

In tidytext:

tidy_trump_tweets$word <- gsub("\\s+", "", tidy_trump_tweets$word)
Stemming reduces each word to its root so that, for example, "tweet," "tweets," and "tweeting" are counted together. In tm:

trump_corpus <- tm_map(trump_corpus, content_transformer(stemDocument), language = "english")

In tidytext, we can use the wordStem function from the SnowballC package:

library(SnowballC)
tidy_trump_tweets <- tidy_trump_tweets %>%
  mutate(word = wordStem(word, language = "en"))
Finally, many analyses require a document-term matrix (DTM), in which each row is a document, each column is a term, and each cell counts how often that term appears in that document. In tm:

trump_DTM <- DocumentTermMatrix(trump_corpus, control = list(wordLengths = c(2, Inf)))

We can inspect a small slice of the result:

inspect(trump_DTM[1:5, 3:8])
<<DocumentTermMatrix (documents: 5, terms: 6)>>
Non-/sparse entries: 6/24
Sparsity : 80%
Maximal term length: 8
Weighting : term frequency (tf)
Sample :
Terms
Docs around conflict fantast great guterr hard
1 1 1 1 1 1 1
2 0 0 0 0 0 0
3 0 0 0 0 0 0
4 0 0 0 0 0 0
5 0 0 0 0 0 0
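tm also provides helpers for querying a DTM. For instance, findFreqTerms lists terms that appear at least a given number of times (the threshold of 200 here is arbitrary):

# Terms appearing at least 200 times across the corpus
findFreqTerms(trump_DTM, lowfreq = 200)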
In tidytext, we count terms within each document and then cast the result into a DTM, using the tweet timestamp as the document identifier:

tidy_trump_DTM <- tidy_trump_tweets %>%
  count(created_at, word) %>%
  cast_dtm(created_at, word, n)
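The reverse conversion is also possible: tidytext's tidy function turns a DTM back into a one-row-per-term-per-document data frame (a quick sketch):

# Convert the tm DTM back to a tidy data frame with document, term, count columns
tidy(trump_DTM) %>% head()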
Now try these techniques yourself:

1) Pick one of the Amazon review datasets from this link: http://jmcauley.ucsd.edu/data/amazon/
2) Create a tidytext dataset of these reviews.
3) Count the top five words in the dataset after removing stop words.
4) Create a document-term matrix.