Subset corpus

# Attach quanteda. library() stops with an error if the package is not
# installed, whereas require() would only warn and return FALSE.
library(quanteda)

corpus_subset() allows you to select documents in a corpus based on document-level variables.

# Start from quanteda's inaugural corpus and count its documents.
corp <- data_corpus_inaugural
ndoc(corp)
## [1] 58

# Peek at the document-level variables that can be used for subsetting.
head(docvars(corp))
##                 Year  President FirstName
## 1789-Washington 1789 Washington    George
## 1793-Washington 1793 Washington    George
## 1797-Adams      1797      Adams      John
## 1801-Jefferson  1801  Jefferson    Thomas
## 1805-Jefferson  1805  Jefferson    Thomas
## 1809-Madison    1809    Madison     James

# Keep only the documents whose Year is 1990 or later.
recent_corp <- corpus_subset(corp, subset = Year >= 1990)
ndoc(recent_corp)
## [1] 7

# Keep only the documents whose President is one of the listed names.
dem_corp <- corpus_subset(
  corp,
  subset = President %in% c("Obama", "Clinton", "Carter")
)
ndoc(dem_corp)
## [1] 5