s = "线程是程序执行时的最小单位,它是进程的一个执行流,\ 是CPU调度和分派的基本单位,一个进程可以由很多个线程组成,\ 线程间共享进程的所有资源,每个线程有自己的堆栈和局部变量。\ 线程由CPU独立调度执行,在多CPU环境下就允许多个线程同时运行。\ 同样多线程也可以实现并发操作,每个请求分配一个线程来处理。" print(s) def word_one(text): return dict([(word,True) for word in text ]) print('单词分词',word_one(s)) import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures def word_two(words, score_fn=BigramAssocMeasures.chi_sq, n=1000): bigram_finder = BigramCollocationFinder.from_words(words) # 把文本变成双词搭配的形式 bigrams = bigram_finder.nbest(score_fn, n) # 使用卡方统计的方法,选择排名前1000的双词 newBigrams = [u + v for (u, v) in bigrams] return word_one(newBigrams) print('两词分词',word_two(s, score_fn=BigramAssocMeasures.chi_sq, n=1000)) def word_total(words, score_fn=BigramAssocMeasures.chi_sq, n=1000): bigram_finder = BigramCollocationFinder.from_words(words) bigrams = bigram_finder.nbest(score_fn, n) newBigrams = [u + v for (u, v) in bigrams] a = word_one(words) b = word_one(newBigrams) a.update(b) # 把字典b合并到字典a中 return a print('综合分词',word_total(s, score_fn=BigramAssocMeasures.chi_sq, n=1000)) import jieba def wold_cut(text): fenci=jieba.lcut(text) return fenci print('jiaba分词',wold_cut(s))