# Attach the packages used throughout this script. library() is used instead
# of require() because it stops immediately with an error when a package is
# not installed; require() would only warn and return FALSE, deferring the
# failure to the first call of a missing function.
library(quanteda)
library(quanteda.corpora)

# Set the console width to 110 characters so printed tokens and
# document-feature matrices wrap less often.
options(width = 110)
After tokenization, we remove so-called “stopwords” using `stopwords("en", source = "marimo")`. If you want to keep only tokens that consist entirely of letters of the English alphabet, you can select them with the regular expression `"^[a-zA-Z]+$"`. You can find more details on stopwords on the website of the stopwords package. Please be very careful when pre-processing or removing tokens, since these choices might influence subsequent results.
# Split the English text of the corpus into one document per paragraph.
corp_eng <- corpus_reshape(data_corpus_udhr["eng"], to = "paragraphs")

# Pre-process step by step, overwriting toks_eng after each stage:
# 1. tokenize, dropping punctuation and numbers;
toks_eng <- tokens(corp_eng, remove_punct = TRUE, remove_numbers = TRUE)
# 2. remove English stopwords taken from the "marimo" source;
toks_eng <- tokens_remove(
  toks_eng,
  pattern = stopwords("en", source = "marimo")
)
# 3. keep only tokens made up entirely of the letters a-z / A-Z.
toks_eng <- tokens_keep(
  toks_eng,
  pattern = "^[a-zA-Z]+$",
  valuetype = "regex"
)

# Inspect the tokens of the second paragraph.
print(toks_eng[2], max_ndoc = 1, max_ntoken = -1)
## Tokens consisting of 1 document and 4 docvars.
## eng.2 :
## [1] "Whereas" "recognition" "inherent" "dignity" "equal" "inalienable"
## [7] "rights" "members" "human" "family" "foundation" "freedom"
## [13] "justice" "peace" "world" "Whereas" "disregard" "contempt"
## [19] "human" "rights" "resulted" "barbarous" "acts" "outraged"
## [25] "conscience" "mankind" "advent" "world" "human" "beings"
## [31] "shall" "enjoy" "freedom" "speech" "belief" "freedom"
## [37] "fear" "want" "proclaimed" "highest" "aspiration" "common"
## [43] "people" "Whereas" "essential" "man" "compelled" "recourse"
## [49] "last" "resort" "rebellion" "tyranny" "oppression" "human"
## [55] "rights" "protected" "rule" "law" "Whereas" "essential"
## [61] "promote" "development" "friendly" "relations" "nations" "Whereas"
## [67] "peoples" "United" "Nations" "Charter" "reaffirmed" "faith"
## [73] "fundamental" "human" "rights" "dignity" "worth" "human"
## [79] "person" "equal" "rights" "men" "women" "determined"
## [85] "promote" "social" "progress" "better" "standards" "life"
## [91] "larger" "freedom" "Whereas" "Member" "States" "pledged"
## [97] "achieve" "United" "Nations" "promotion" "universal" "respect"
## [103] "observance" "human" "rights" "fundamental" "freedoms" "Whereas"
## [109] "common" "understanding" "rights" "freedoms" "greatest" "importance"
## [115] "full" "realization" "pledge" "Now" "therefore" "General"
## [121] "Assembly" "Proclaims" "Universal" "Declaration" "Human" "Rights"
## [127] "common" "standard" "achievement" "peoples" "nations" "end"
## [133] "every" "individual" "every" "organ" "society" "keeping"
## [139] "Declaration" "constantly" "mind" "shall" "strive" "teaching"
## [145] "education" "promote" "respect" "rights" "freedoms" "progressive"
## [151] "measures" "national" "international" "secure" "universal" "effective"
## [157] "recognition" "observance" "among" "peoples" "Member" "States"
## [163] "among" "peoples" "territories" "jurisdiction"
# Build the document-feature matrix from the pre-processed English tokens.
# tolower = TRUE is dfm()'s default and is spelled out here because it is why
# the features printed below are lower-case although the tokens are not.
dfmat_eng <- dfm(toks_eng, tolower = TRUE)
print(dfmat_eng)
## Document-feature matrix of: 82 documents, 431 features (97.82% sparse) and 4 docvars.
## features
## docs preamble whereas recognition inherent dignity equal inalienable rights members human
## eng.1 1 0 0 0 0 0 0 0 0 0
## eng.2 0 7 2 1 2 2 1 9 1 8
## eng.3 0 0 0 0 0 0 0 0 0 0
## eng.4 0 0 0 0 1 1 0 1 0 1
## eng.5 0 0 0 0 0 0 0 0 0 0
## eng.6 0 0 0 0 0 0 0 1 0 0
## [ reached max_ndoc ... 76 more documents, reached max_nfeat ... 421 more features ]
Pre-processing of German texts is very similar to that of English texts, but we have to use the Unicode script property in the pattern `"^[\\p{script=Latn}]+$"` so that characters with umlauts (ä/ö/ü) are also matched.
# Split the German (1996) text of the corpus into one document per paragraph.
corp_ger <- corpus_reshape(data_corpus_udhr["deu_1996"], to = "paragraphs")

# Pre-process step by step, overwriting toks_ger after each stage:
# 1. tokenize, dropping punctuation and numbers;
toks_ger <- tokens(corp_ger, remove_punct = TRUE, remove_numbers = TRUE)
# 2. remove German stopwords taken from the "marimo" source;
toks_ger <- tokens_remove(
  toks_ger,
  pattern = stopwords("de", source = "marimo")
)
# 3. keep only tokens made up entirely of Latin-script characters, which
#    (unlike a plain a-z class) also admits letters such as ä, ö, ü and ß.
toks_ger <- tokens_keep(
  toks_ger,
  pattern = "^[\\p{script=Latn}]+$",
  valuetype = "regex"
)

# Inspect the tokens of the second paragraph.
print(toks_ger[2], max_ndoc = 1, max_ntoken = -1)
## Tokens consisting of 1 document and 4 docvars.
## deu_1996.2 :
## [1] "Anerkennung" "angeborenen" "gleichen" "unveräußerlichen" "Rechte"
## [6] "Mitglieder" "Gemeinschaft" "Menschen" "Grundlage" "Freiheit"
## [11] "Gerechtigkeit" "Frieden" "Welt" "bildet" "Nichtanerkennung"
## [16] "Verachtung" "Menschenrechte" "Akten" "Barbarei" "geführt"
## [21] "Gewissen" "Menschheit" "Empörung" "erfüllen" "verkündet"
## [26] "worden" "Welt" "Menschen" "Glaubensfreiheit" "Freiheit"
## [31] "Furcht" "Not" "genießen" "höchste" "Streben"
## [36] "Menschen" "gilt" "notwendig" "Menschenrechte" "Herrschaft"
## [41] "Rechtes" "schützen" "Mensch" "gezwungen" "letztes"
## [46] "Mittel" "Aufstand" "Tyrannei" "Unterdrückung" "greifen"
## [51] "notwendig" "Entwicklung" "freundschaftlicher" "Beziehungen" "Nationen"
## [56] "fördern" "Völker" "Vereinten" "Nationen" "Charta"
## [61] "ihren" "Glauben" "grundlegenden" "Menschenrechte" "Wert"
## [66] "menschlichen" "Person" "Gleichberechtigung" "Mann" "Frau"
## [71] "erneut" "bekräftigt" "beschlossen" "sozialen" "Fortschritt"
## [76] "bessere" "Lebensbedingungen" "größerer" "Freiheit" "fördern"
## [81] "Mitgliedstaaten" "verpflichtet" "Zusammenarbeit" "Vereinten" "Nationen"
## [86] "allgemeine" "Achtung" "Einhaltung" "Menschenrechte" "Grundfreiheiten"
## [91] "hinzuwirken" "gemeinsames" "Verständnis" "Rechte" "Freiheiten"
## [96] "größter" "Wichtigkeit" "volle" "Erfüllung" "Verpflichtung"
## [101] "verkündet" "Generalversammlung" "Allgemeine" "Erklärung" "Menschenrechte"
## [106] "Völkern" "Nationen" "erreichende" "gemeinsame" "Ideal"
## [111] "einzelne" "Organe" "Gesellschaft" "Erklärung" "gegenwärtig"
## [116] "halten" "bemühen" "Unterricht" "Erziehung" "Achtung"
## [121] "Rechten" "Freiheiten" "fördern" "fortschreitende" "nationale"
## [126] "internationale" "Maßnahmen" "allgemeine" "tatsächliche" "Anerkennung"
## [131] "Einhaltung" "Bevölkerung" "Mitgliedstaaten" "Bevölkerung" "ihrer"
## [136] "Hoheitsgewalt" "unterstehenden" "Gebiete" "gewährleisten"
# Build the document-feature matrix from the pre-processed German tokens.
# tolower = TRUE is dfm()'s default and is spelled out here because it is why
# the features printed below are lower-case although the tokens are not.
dfmat_ger <- dfm(toks_ger, tolower = TRUE)
print(dfmat_ger)
## Document-feature matrix of: 82 documents, 496 features (98.18% sparse) and 4 docvars.
## features
## docs präambel anerkennung angeborenen gleichen unveräußerlichen rechte mitglieder gemeinschaft
## deu_1996.1 1 0 0 0 0 0 0 0
## deu_1996.2 0 2 1 1 1 2 1 1
## deu_1996.3 0 0 0 0 0 0 0 0
## deu_1996.4 0 0 0 0 0 0 0 0
## deu_1996.5 0 0 0 0 0 0 0 0
## deu_1996.6 0 0 0 0 0 1 0 0
## features
## docs menschen grundlage
## deu_1996.1 0 0
## deu_1996.2 3 1
## deu_1996.3 0 0
## deu_1996.4 1 0
## deu_1996.5 0 0
## deu_1996.6 0 0
## [ reached max_ndoc ... 76 more documents, reached max_nfeat ... 486 more features ]