# This tutorial presents a practical approach to text processing in Python
# (jieba word segmentation plus symbol removal); hopefully it helps.
# See the code below.
import codecs
import re

import jieba.analyse
import pandas as pd

# Default locations used by the article's pipeline.
# NOTE(review): simplification_text() writes to .../yiwoqu/code/ while
# jieba_text() reads from .../untitled29/ -- confirm whether the file is
# expected to be moved between the two steps or one of the paths is wrong.
XIANBINGSHI_WRITE_PATH = r'C:/Users/Administrator.SC-201812211013/PycharmProjects/untitled29/yiwoqu/code/xianbingshi_write.txt'
JIEBA_INPUT_PATH = r"C:/Users/Administrator.SC-201812211013/PycharmProjects/untitled29/xianbingshi_write.txt"

# Text enclosed between a "<b>" marker and the next "<e>" marker.
_TAGGED_SPAN_RE = re.compile(r'(?<=<b>).*?(?=<e>)')

# Tokens that are numbers, pure alphanumerics, or start with a long
# alphanumeric run; matches are replaced with the empty string.
_NOISE_WORD_RE = re.compile(
    r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
)

# Digits, whitespace and punctuation (ASCII and CJK) to blank out.
# NOTE(review): ":-【" inside the first class is a character *range*
# (U+003A..U+3010), so it also matches ASCII letters and many other
# characters -- confirm whether a literal "-" was intended.
_PUNCTUATION_RE = re.compile(
    r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+"
)


def simplification_text(xianbingshi, output_path=XIANBINGSHI_WRITE_PATH):
    """Extract the text found between ``<b>`` and ``<e>`` markers.

    Args:
        xianbingshi: Path of the UTF-8 input file; it is read line by line
            and each line is stripped before matching.
        output_path: Path of the UTF-8 file that receives one extracted
            span per line. Defaults to the location used in the article.
    """
    xianbingshi_simplification = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        for line in f:
            xianbingshi_simplification.extend(
                _TAGGED_SPAN_RE.findall(line.strip())
            )
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.writelines(span + '\n' for span in xianbingshi_simplification)


def jieba_text(input_path=JIEBA_INPUT_PATH, output_path='word.txt'):
    """Segment a text file with jieba and write its unique tokens.

    Args:
        input_path: Path of the UTF-8 text file to segment.
        output_path: Path of the UTF-8 file that receives one token per
            line, de-duplicated with the first occurrence kept.
    """
    # Read through a context manager so the handle is always closed.
    with open(input_path, encoding='utf-8') as f:
        data = f.read()
    # cut_all=False selects jieba's precise mode.
    word_list = [token.strip() for token in jieba.cut(data, cut_all=False)]
    data_quchong = pd.DataFrame({'a': word_list})
    data_quchong.drop_duplicates(subset=['a'], keep='first', inplace=True)
    with codecs.open(output_path, 'w', 'utf8') as w:
        w.writelines(token + '\n' for token in data_quchong['a'].tolist())


def word_messy(word, output_path='word.txt'):
    """Refine a word list by blanking numeric and alphanumeric entries.

    Each line of the input is passed through the noise pattern, the results
    are sorted, and every result is written back on its own line.

    Args:
        word: Path of the UTF-8 word-list file, one word per line.
        output_path: Path of the UTF-8 file that receives the sorted result.
    """
    # The input is fully consumed before the output is opened, so ``word``
    # and ``output_path`` may refer to the same file.
    with codecs.open(word, 'r', 'utf8') as f:
        word_sub_list = sorted(_NOISE_WORD_RE.sub('', line) for line in f)
    with codecs.open(output_path, 'w', 'utf8') as w:
        w.writelines(line.strip('\n') + '\n' for line in word_sub_list)


if __name__ == '__main__':
    xianbingshi = r'C:/Users/Administrator.SC-201812211013/PycharmProjects/untitled29/yiwoqu/xianbingshi_sub_sen_all(1).txt'
    # word = r'C:/Users/Administrator.SC-201812211013/PycharmProjects/untitled29/word.txt'
    simplification_text(xianbingshi)


# Supplement from the article: jieba segmentation in Python, using re to
# strip symbols. See the code below. This is a separate, standalone script
# whose statements run at module level.

# Build the stop-word dictionary (each word maps to itself).
stopwords = {}
with open('stop_words.txt', 'r', encoding='utf-8', errors='ignore') as fstop:
    for eachWord in fstop:
        stopwords[eachWord.strip()] = eachWord.strip()

with open('all.txt', 'r', encoding='utf-8', errors='ignore') as f1, \
        open('allutf11.txt', 'w', encoding='utf-8') as f2:
    for line in f1:
        # Trim surrounding whitespace, then blank out digits and punctuation.
        line = _PUNCTUATION_RE.sub(" ", line.strip())
        # Segment with jieba in precise mode.
        seg_list = jieba.cut(line, cut_all=False)
        # Every kept token is followed by a single space; no newline is
        # written between input lines.
        f2.write("".join(token + " " for token in seg_list
                         if token not in stopwords))
以上为个人经验,希望能给大家一个参考,也希望大家多多支持51zixue.net。 相关文章:Django操作cookie的实现;python numpy中multiply与*及matmul的区别说明