| |
| import json |
| import paddlenlp |
| import gensim |
| import sklearn |
| from collections import Counter |
| from gensim import corpora, models, similarities |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
|
|
|
def build_corpus(sentences):
    """Tokenize Chinese sentences and build a gensim bag-of-words corpus.

    Args:
        sentences: iterable of raw sentence strings.

    Returns:
        Tuple ``(corpus, dictionary, words_list)`` where ``corpus`` is a list
        of doc2bow vectors (one per sentence), ``dictionary`` is the gensim
        Dictionary over all tokens, and ``words_list`` is the per-sentence
        list of tokens (parallel to ``sentences``).
    """
    # The pretrained BERT vocabulary supplies the word table for Jieba
    # segmentation.
    vocab = paddlenlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese').vocab
    tokenizer = paddlenlp.data.JiebaTokenizer(vocab)

    # Tokens to drop; a set gives O(1) membership tests.
    # NOTE(review): only the empty string is filtered here — plug in a real
    # Chinese stopword list if noise words hurt topic quality.
    stopwords = {""}

    words_list = [
        [word for word in tokenizer.cut(sentence) if word not in stopwords]
        for sentence in sentences
    ]

    # (The original also built a flattened list of all tokens here, but it was
    # never used — removed as dead code.)

    # Map tokens to integer ids, then convert each document to bag-of-words.
    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in words_list]

    return corpus, dictionary, words_list
|
|
def lda(words_list, sentences, corpus, dictionary, num):
    """Run LDA topic modelling and pick one central sentence per topic.

    Args:
        words_list: per-sentence token lists (parallel to ``sentences``).
        sentences: the original sentence strings.
        corpus: gensim bag-of-words corpus built from ``words_list``.
        dictionary: gensim Dictionary mapping id -> token.
        num: number of topics to extract.

    Returns:
        List of at most ``num`` distinct sentences — for each topic, the first
        sentence sharing the most words with that topic's top-10 terms that
        has not already been chosen for an earlier topic.
    """
    # Renamed from `lda` to avoid shadowing this function's own name.
    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num)
    topics = model.print_topics(num_topics=num, num_words=10)

    central_sentences = []
    for _topic_id, topic_repr in topics:
        # topic_repr looks like '0.015*"word" + 0.012*"other" + ...'.
        # BUG FIX: splitting on "+" leaves surrounding spaces, so the original
        # strip('"') produced e.g. 'word" ' (the trailing space blocks the
        # quote from being stripped) for every term except the last — the
        # membership test below then almost never matched. Strip whitespace
        # first, then the quotes.
        topic_words = [part.split("*")[1].strip().strip('"') for part in topic_repr.split("+")]
        # Hoist membership testing into a set: O(1) lookups inside the loop.
        topic_word_set = set(topic_words)

        # Score each sentence by how many of its tokens are topic words
        # (duplicate tokens count multiple times, as in the original).
        max_score = 0
        candidates = []
        for sentence, words in zip(sentences, words_list):
            score = sum(1 for word in words if word in topic_word_set)
            if score > max_score:
                max_score = score
                candidates = [sentence]
            elif score == max_score:
                candidates.append(sentence)

        # Keep the first best-scoring sentence not already selected, so each
        # topic contributes a distinct central sentence.
        for candidate in candidates:
            if candidate not in central_sentences:
                central_sentences.append(candidate)
                break

    return central_sentences
|
|
|
|
def abstruct_main(sentences, num):
    """Extract ``num`` central sentences from ``sentences`` via LDA topics.

    Thin orchestration wrapper: builds the bag-of-words corpus, then asks the
    topic model for one representative sentence per topic.
    """
    corpus, dictionary, words_list = build_corpus(sentences)
    return lda(words_list, sentences, corpus, dictionary, num)
|
|