Subset corpus

# Attach quanteda. library() stops with an error when the package is not
# installed, whereas require() only warns and returns FALSE, letting the
# script continue and fail later with a less helpful message.
library(quanteda)

corpus_subset() selects the documents in a corpus that satisfy a logical condition on its document-level variables (docvars).

# Work on the inaugural corpus under a shorter name.
corp <- data_corpus_inaugural

# Number of documents in the full corpus.
ndoc(corp)
## [1] 59

# First rows of the document-level variables that can be used for subsetting.
head(docvars(corp))
##   Year  President FirstName                 Party
## 1 1789 Washington    George                  none
## 2 1793 Washington    George                  none
## 3 1797      Adams      John            Federalist
## 4 1801  Jefferson    Thomas Democratic-Republican
## 5 1805  Jefferson    Thomas Democratic-Republican
## 6 1809    Madison     James Democratic-Republican

# Keep only the documents whose Year docvar is 1990 or later.
corp_recent <- corpus_subset(corp, subset = Year >= 1990)
ndoc(corp_recent)
## [1] 8

# Keep only the documents whose President docvar matches one of these names.
presidents_dem <- c("Obama", "Clinton", "Carter")
corp_dem <- corpus_subset(corp, subset = President %in% presidents_dem)
ndoc(corp_dem)
## [1] 5