Similarity between Twitter users

# Attach with library() so a missing package stops the script immediately;
# require() would only warn and return FALSE, deferring the failure to the
# first call that needs the package.
library(quanteda)
library(readtext)

Import Tweets from a JSON (.json) file. twitter.json is located in the data directory of this tutorial package.

# Load the raw tweets; `source = "twitter"` asks readtext to parse the JSON
# using its Twitter-specific format.
twitter_data <- readtext(
  file = "content/data/twitter.json",
  source = "twitter"
)

Construct a corpus of Tweets.

# Wrap the imported tweets in a quanteda corpus object.
tweet_corp <- corpus(x = twitter_data)

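Construct a document-feature matrix (DFM), removing punctuation, links, hashtags, usernames, the retweet marker "rt", and English stopwords.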

# Build the document-feature matrix in explicit steps. Calling dfm() directly
# on a corpus with tokenizer options and `remove =` is deprecated in
# quanteda 3 and defunct in quanteda 4, so tokenize first and filter after.
#   1. tokens():     split tweets into tokens, dropping punctuation and URLs.
#   2. dfm():        count tokens per tweet (features are lower-cased).
#   3. dfm_remove(): drop leftover domain fragments, the retweet marker "rt",
#                    hashtags ("#*") and usernames ("@*") via glob patterns.
#   4. dfm_remove(): drop English stopwords.
tweet_dfm <- tokens(tweet_corp, remove_punct = TRUE, remove_url = TRUE) %>%
  dfm() %>%
  dfm_remove(c("*.tt", "*.uk", "*.com", "rt", "#*", "@*")) %>%
  dfm_remove(stopwords("en"))

# Number of tweets (documents) in the matrix.
ndoc(tweet_dfm)
## [1] 7504
# Ten most frequent features across all tweets.
topfeatures(tweet_dfm, n = 10)
##          vote conservatives        labour         today         share 
##          1817           929           676           666           647 
##       britain          find        fairer        voting      tomorrow 
##           625           613           571           559           548

Group documents by usernames.

# Merge the rows of all tweets that share a screen_name, giving one document
# per user.
user_dfm <- dfm_group(
  tweet_dfm,
  groups = docvars(tweet_dfm, "screen_name")
)
ndoc(user_dfm)
## [1] 5061

Remove rare (occurring fewer than 10 times) and short (one-character) features, and convert counts to proportions using dfm_weight().

prop_user_dfm <- user_dfm %>%
  # Keep only features that are at least two characters long.
  dfm_select(min_nchar = 2) %>%
  # Keep only features that occur at least 10 times overall.
  dfm_trim(min_termfreq = 10) %>%
  # Turn each user's counts into within-user proportions.
  dfm_weight(scheme = "prop")

Calculate user-user distances using textstat_dist(), then cluster the users hierarchically with hclust().

# Pairwise distances between users, computed on the proportion-weighted matrix.
# NOTE(review): in quanteda >= 3, textstat_dist() is provided by the
# quanteda.textstats package - confirm that package is attached.
user_dist <- textstat_dist(prop_user_dfm)
# hclust() requires a base "dist" object. Current quanteda returns its own
# distance class from textstat_dist(), so coerce explicitly; as.dist() leaves
# an object that is already a "dist" unchanged.
user_clust <- hclust(as.dist(user_dist))
# Draw the dendrogram without leaf labels (one leaf per user would be
# unreadable).
plot(user_clust, labels = FALSE)