# Attach the text-analysis packages used below. library() is used instead of
# require() so that a missing package stops the script immediately with a
# clear error, rather than emitting a warning and failing later on the first
# call into the package.
library(quanteda)
library(quanteda.corpora)

# Widen the console so the printed tokens and document-feature matrix wrap
# less.
options(width = 110)
# We use the Marimo stopwords list (stopwords("zh_cn", source = "marimo"))
# and a minimum word length (min_nchar = 2) to remove function words.
# You can keep only tokens made up entirely of Chinese characters with the
# regular expression "^\\p{script=Hani}+$".
# Split the "cmn_hans" document of the UDHR corpus into one document per
# paragraph.
corp <- corpus_reshape(data_corpus_udhr["cmn_hans"], to = "paragraphs")

# Step 1: tokenize, discarding punctuation and numbers.
toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE)

# Step 2: remove Marimo stopwords; min_nchar = 2 is passed along so that
# single-character tokens are filtered out as well.
toks <- tokens_remove(
  toks,
  pattern = stopwords("zh_cn", source = "marimo"),
  min_nchar = 2
)

# Step 3: keep only tokens composed entirely of Han-script characters.
toks <- tokens_keep(
  toks,
  pattern = "^\\p{script=Hani}+$",
  valuetype = "regex"
)

# Show every token of the second paragraph (max_ntok = -1 lifts the limit).
print(toks[2], max_ndoc = 1, max_ntok = -1)
## Tokens consisting of 1 document and 4 docvars.
## cmn_hans.2 :
## [1] "鉴于" "人类" "家庭" "成员" "固有" "尊严" "及其" "平等" "不移"
## [10] "权利" "承认" "乃是" "世界" "自由" "正义" "和平" "基础" "鉴于"
## [19] "人权" "无视" "侮蔑" "发展" "野蛮" "暴行" "暴行" "玷污" "人类"
## [28] "良心" "一个" "人人" "享有" "言论" "信仰" "自由" "恐惧" "世界"
## [37] "来临" "宣布" "普通" "人民" "最高" "愿望" "鉴于" "人类" "不致"
## [46] "迫不得已" "暴政" "压迫" "进行" "反叛" "必要" "人权" "法治" "保护"
## [55] "鉴于" "必要" "促进" "各国" "友好" "关系" "发展" "鉴于" "联合"
## [64] "国家" "人民" "联合" "宪章" "重申" "他们" "基本" "人权" "人格"
## [73] "尊严" "价值" "男女平等" "权利" "信念" "决心" "促成" "较大" "自由"
## [82] "中的" "社会" "进步" "生活" "水平" "改善" "鉴于" "会员" "联合"
## [91] "合作" "促进" "人权" "基本" "自由" "普遍" "尊重" "遵行" "鉴于"
## [100] "权利" "自由" "普遍" "了解" "充分" "实现" "具有" "很大" "重要性"
## [109] "现在" "大会" "发布" "世界" "人权" "宣言" "人民" "国家" "努力"
## [118] "实现" "共同" "标准" "以期" "每一个人" "社会" "机构" "经常" "宣言"
## [127] "努力" "教诲" "教育" "促进" "权利" "自由" "尊重" "国家" "国际"
## [136] "渐进" "措施" "权利" "自由" "会员" "本身" "人民" "管辖" "领土"
## [145] "人民" "得到" "普遍" "有效" "承认" "遵行"
# Build the document-feature matrix from the filtered tokens: one row per
# paragraph document, one column per distinct token.
dfmat <- dfm(x = toks)

# Print a preview of the matrix (dimensions, sparsity, first rows/columns).
print(dfmat)
## Document-feature matrix of: 82 documents, 424 features (97.72% sparse) and 4 docvars.
## features
## docs 序言 鉴于 人类 家庭 成员 固有 尊严 及其 平等 不移
## cmn_hans.1 1 0 0 0 0 0 0 0 0 0
## cmn_hans.2 0 7 3 1 1 1 2 1 1 1
## cmn_hans.3 0 0 0 0 0 0 0 0 0 0
## cmn_hans.4 0 0 0 0 0 0 1 0 1 0
## cmn_hans.5 0 0 0 0 0 0 0 0 0 0
## cmn_hans.6 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 76 more documents, reached max_nfeat ... 414 more features ]