1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
| import pandas as pd import os import jieba from gensim import corpora from gensim import models import re from gensim.models.ldamodel import LdaModel
# Prints 1 before any data is loaded (appears to be a progress marker).
print(1)

# Column 0 of the "qa" sheet, read without a header row and exposed as
# the single column "qa".
qa = pd.read_excel(
    'data/qa.xlsx',
    sheet_name="qa",
    header=None,
    usecols=[0],
    names=['qa'],
)

# Column 0 of the "keyWords" sheet of the same workbook, exposed as the
# single column "keywords".
keyWords = pd.read_excel(
    'data/qa.xlsx',
    sheet_name="keyWords",
    header=None,
    usecols=[0],
    names=['keywords'],
)

# Add every keyword to jieba's dictionary (presumably so these terms are
# kept whole when the documents are segmented later).
for keyword in keyWords['keywords']:
    jieba.add_word(keyword)
import re  # also imported at the top of the file


def stopwordsPattern():
    """Load the stop-word list from ``stopWord.txt``.

    The file is opened as UTF-8 relative to the current working directory
    and is expected to hold one entry per line. Only the line terminator
    is stripped, so a blank line yields an empty-string entry and any
    surrounding spaces are kept.

    Returns:
        list: The entries as strings, in file order.

    Raises:
        OSError: If ``stopWord.txt`` cannot be opened.
    """
    # The context manager guarantees the file handle is closed.
    with open('stopWord.txt', encoding='UTF-8') as stopFile:
        return [line.rstrip("\n") for line in stopFile]


def paperCut(intxt, pattern=None):
    """Segment ``intxt`` with jieba and drop every stop word.

    Args:
        intxt: Text to segment; passed unchanged to ``jieba.lcut``.
        pattern: Iterable of stop-word strings. When omitted (``None``),
            the list is loaded with ``stopwordsPattern()`` at call time,
            so defining this function does not require ``stopWord.txt``
            to exist.

    Returns:
        list: The tokens produced by ``jieba.lcut``, in order, without any
        token that appears in ``pattern``.
    """
    if pattern is None:
        pattern = stopwordsPattern()
    # Build the set once so each token costs one hash lookup.
    stopwords = frozenset(pattern)
    # Filter into a new list instead of calling remove() on the list being
    # iterated: removing during iteration shifts the remaining items left,
    # so the token right after a removed stop word would never be checked,
    # and remove() deletes the first occurrence rather than the current one.
    return [token for token in jieba.lcut(intxt) if token not in stopwords]
# Read the stop-word file once; stopwordsPattern() opens and parses
# stopWord.txt on every call, so calling it per document repeats that I/O.
# A frozenset also makes the membership tests inside paperCut O(1).
stopWords = frozenset(stopwordsPattern())

# One token list per entry of the "qa" column, with stop words removed.
wordList = [paperCut(paper, stopWords) for paper in qa.qa]
# Token <-> integer-id mapping built from all tokenized documents.
wordDict = corpora.Dictionary(wordList)

# Bag-of-words representation, one vector per document.
corpus = list(map(wordDict.doc2bow, wordList))

# Fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus.
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

# Train a 4-topic LDA model on the TF-IDF-weighted corpus.
ldamodel = LdaModel(
    corpus=corpus_tfidf,
    id2word=wordDict,
    num_topics=4,
    passes=5,
    alpha=5,
    eta=0.1,
)
# The model is saved as ./<modelName>/<modelName>.
modelName = "model_lda1"
dirPath = "./{}/".format(modelName)

# Create the target directory if needed. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.mkdir(),
# and fails with an explicit error if the path exists but is not a directory.
os.makedirs(dirPath, exist_ok=True)

ldamodel.save(dirPath + modelName)

# Prints 2 after the model has been saved (appears to be a progress marker).
print(2)
|