| import tokenizer as text_rank_tokenizer |
| import ranker as text_rank_ranker |
| import numpy as np |
|
|
def _read_text_file(path):
    """Return the entire contents of the UTF-8 text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the content has been read.
    """
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read()


# Language resources, loaded once at import time. The paths are relative,
# so they resolve against the process's current working directory.
# stop_words and word_endings are kept as the raw file contents (one string each).
stop_words = _read_text_file("./text_rank/stopwords.txt")
word_endings = _read_text_file("./text_rank/word_endings.txt")
# kriyapads and samyojaks are lists with one entry per line of their files.
kriyapads = _read_text_file("./text_rank/minimal_kriyapad.txt").split("\n")
samyojaks = _read_text_file("./text_rank/samyojak.txt").split("\n")
# Only the path is stored here; the JSON file itself is not read at this point.
valid_chars = "./text_rank/valid_chars.json"
|
|
def get_summary_from_text(text,force_use_purnabiram_model=False):
    """Build an extractive summary of *text* from its highest-ranked sentences.

    The text is split into sentences, repeated sentences are dropped, the
    words are filtered and tokenized, and the ranker scores every sentence.
    About one third of the remaining sentences (``ceil(count * 0.33)``) are
    selected and handed to ``text_rank_ranker.get_summarized_text``.

    Args:
        text: The input text to summarise.
        force_use_purnabiram_model: When true, always run
            ``text_rank_tokenizer.add_purnabiram`` on the text before
            splitting it into sentences, regardless of how many "।"
            characters it already contains.

    Returns:
        * A fixed error message string when no sentences are found.
        * The de-duplicated sentence collection itself when exactly one
          sentence is found (NOTE(review): this is not the same type as the
          other return paths; kept as-is so existing callers are unaffected).
        * Otherwise whatever ``text_rank_ranker.get_summarized_text`` returns.
    """
    # Heuristic: fewer than one purnabiram ("।") per 100 characters means the
    # text is treated as lacking sentence terminators.
    purnabiram_count = text.count("।")
    needs_purnabiram = (
        force_use_purnabiram_model or purnabiram_count * 100 < len(text)
    )

    valid_characters = text_rank_tokenizer.get_valid_chars(valid_chars)

    if needs_purnabiram:
        text = text_rank_tokenizer.add_purnabiram(text, kriyapads, samyojaks)

    sentences = text_rank_tokenizer.get_sentences_as_arr(text)

    # NOTE(review): the cleaned text is never read again in this function
    # because the sentences were already extracted above. The call is kept
    # unchanged in case the helper is relied on for validation or side
    # effects; confirm whether cleaning was meant to happen before splitting.
    text = text_rank_tokenizer.remove_useless_characters(text, valid_characters)

    sentences = text_rank_tokenizer.remove_repeating_sentences(sentences)

    if len(sentences) == 0:
        return "It is not a valid text. Please try again with a valid text."
    if len(sentences) == 1:
        # Nothing to rank with a single sentence.
        return sentences

    words_arr = text_rank_tokenizer.get_words_as_arr(sentences)
    words_arr = text_rank_tokenizer.remove_stop_words_and_filter_word_arr(
        words_arr, word_endings, stop_words
    )

    # Drop sentences whose word list is empty after filtering, keeping both
    # sequences aligned.
    sentences, words_arr = text_rank_tokenizer.remove_empty_sentences(
        sentences, words_arr
    )

    tokens, token_dict = text_rank_tokenizer.tokenize(words_arr)

    association_matrix, counter_vector = text_rank_ranker.create_association_matrix(
        tokens, No_of_unique_chars=len(token_dict)
    )
    word_influence_vector = text_rank_ranker.calculate_word_ranks(
        association_matrix, counter_vector
    )
    sentence_influence = text_rank_ranker.calculate_sentence_influence(
        tokens, word_influence_vector
    )

    # np.ceil returns a NumPy float; convert it so the ranker receives a
    # real integer count of sentences to keep.
    summary_size = int(np.ceil(len(sentences) * 0.33))
    summary_sentences = text_rank_ranker.get_n_influencial_sentence(
        sentences, sentence_influence, n=summary_size
    )

    return text_rank_ranker.get_summarized_text(summary_sentences)
|
|
|
|
|
|