Spaces:
Running
Running
| """This module handles all textual preprocessing tasks, all textual postprocessing tasks. | |
| @Author: Karthick T. Sharma | |
| """ | |
| import re | |
| # from deep_translator import GoogleTranslator | |
| # import nltk | |
| # from nltk.tokenize import sent_tokenize | |
| # nltk.download('punkt') | |
def filter_text(context):
    """Keep only letters, digits, and the symbols .,-?%&*() in *context*.

    Unicode hyphen/dash variants (U+2010..U+2013) are normalized to an
    ASCII '-', every other disallowed character is replaced by a space,
    and runs of multiple spaces are collapsed to a single space.

    Args:
        context (str): input string for processing.

    Returns:
        str: processed string.
    """
    text = context.strip()
    # Normalize unicode hyphen/dash variants to a plain ASCII hyphen.
    text = re.sub('[\u2010-\u2013]', '-', text)
    # BUG FIX: the original class [^a-zA-Z0-9\.,-?%&*()] parsed ',-?' as a
    # character RANGE (comma..question mark), accidentally keeping
    # '/', ':', ';', '<', '=', '>'. Escaping the hyphen makes only the
    # documented symbols survive.
    text = re.sub(r'[^a-zA-Z0-9.,\-?%&*()]', ' ', text)
    text = re.sub(' {2,}', ' ', text)
    return text
def split_text(context, char_range=300):
    """Split the bulk input text into sentence-aligned chunks.

    The text is first cleaned with filter_text(), then cut into pieces of
    roughly ``char_range`` characters; each cut is deferred until the next
    full stop so a sentence is never broken mid-way.

    Args:
        context (str): raw text to be split.
        char_range (int): approximate chunk size in characters.

    Returns:
        list[str]: list of text chunks.
    """
    bulk_text = filter_text(context=context)
    if len(bulk_text) <= char_range:
        return [bulk_text]
    splitted_texts = []
    while len(bulk_text) > char_range:
        i = char_range
        # Advance to the next '.' (or end of text) so the chunk ends on a
        # sentence boundary.
        while i < len(bulk_text) and bulk_text[i] != '.':
            i += 1
        splitted_texts.append(bulk_text[:i + 1])
        # BUG FIX: the original used bulk_text.replace(chunk, ""), which
        # removes EVERY occurrence of the chunk and corrupts later text when
        # a chunk repeats. Slice off the consumed prefix instead.
        bulk_text = bulk_text[i + 1:]
    # BUG FIX: the original silently dropped the final tail shorter than
    # char_range; keep it so no input text is lost.
    if bulk_text:
        splitted_texts.append(bulk_text)
    return splitted_texts
def change_format(false_ans):
    """Change s2v results to a fair readable form.

    Strips the '|SENSE' tag, replaces underscores with spaces, and
    upper-cases the first character of each word.

    Args:
        false_ans (list[tuple[str, int]]): list of most similar words and
            their similarity, as returned by sense2vec.

    Returns:
        list[str]: false_ans in fair-readable format.
    """
    output = []
    for result in false_ans:
        # 'new_york|NOUN' -> 'new_york' -> 'new york' -> 'New york'
        word = result[0].split('|')[0].replace('_', ' ')
        # BUG FIX: the original res[0].upper() raised IndexError for an
        # empty word; slicing with [:1] is safe on the empty string.
        output.append(word[:1].upper() + word[1:])
    return output
| # def postprocess_summary(text): | |
| # """Postprocess the output of summarizer model for fair readable output. | |
| # Capitalize first word of sentence. Put spaces in required place. | |
| # Args: | |
| # text (str): summarized text to processed. | |
| # Returns: | |
| # str: clean-human readable text. | |
| # """ | |
| # output = "" | |
| # for token in sent_tokenize(text): | |
| # token = token.capitalize() | |
| # output += " " + token | |
| # return output | |
def postprocess_question(text):
    """Clean a generated question for display.

    Drops every 'question: ' marker emitted by the generation model and
    trims surrounding whitespace.

    Args:
        text (str): generated question to be processed.

    Returns:
        str: clean readable text.
    """
    return text.replace("question: ", "").strip()
| # Translate Vietnamese -> English | |
| # def vietnamese_to_english(text): | |
| # translator = GoogleTranslator(source='vi', target='en') | |
| # translated_text = translator.translate(text) | |
| # return translated_text | |
| # def english_to_vietnamese(text): | |
| # translator = GoogleTranslator(source='en', target='vi') | |
| # translated_text = translator.translate(text) | |
| # return translated_text | |
| # def get_all_summary(model, context): | |
| # """Generate summary of input corpus. | |
| # Args: | |
| # model (OnnxT5): T5 transformer for summarization. | |
| # context (str): Bunch of unprocessed text. | |
| # Returns: | |
| # tuple(list(str), list(str)): tuple of, list of summarized text chunks and list of | |
| # original text chuncks. | |
| # """ | |
| # summary = [] | |
| # splitted_text = model.preprocess_input(context) | |
| # for txt in splitted_text: | |
| # summary.append(model.summarize(txt)) | |
| # return summary, splitted_text | |
| # def get_all_questions(model, context, answer): | |
| # """Return list of generated questions. | |
| # Args: | |
| # model (OnnxT5): T5 transformer for question generation. | |
| # context (list(str)): list of context for generating questions. | |
| # answer (list(str)): list of answers for question which will be generated. | |
| # Returns: | |
| # list(str): list of questions within given context | |
| # """ | |
| # questions = [] | |
| # for cont, ans in zip(context, answer): | |
| # questions.append(model.generate(cont, ans)) | |
| # # squeezing the 2d list to 1d | |
| # return questions | |