"""This module handles all textual preprocessing tasks, all textual postprocessing tasks. @Author: Karthick T. Sharma """ import re # from deep_translator import GoogleTranslator # import nltk # from nltk.tokenize import sent_tokenize # nltk.download('punkt') def filter_text(context): """Remove all signs other than -,-,a-z,A-Z,0-9, and some symbols..... and remove all extra blank spaces. Args: text (str): input string for processing. Returns: str: processed string. """ text = context.strip() text = re.sub('[\u2010-\u2013]', '-', text) text = re.sub(r'[^a-zA-Z0-9\.,-?%&*()]', ' ', text) text = re.sub(' {2,}', ' ', text) return text def split_text(context, char_range=300): """Split the bulk input text into small chunks. Args: text (str): processed string to be splitted. Returns: list[str]: list of splitted corpus. """ bulk_text = filter_text(context=context) if len(bulk_text) <= char_range: return [bulk_text] splitted_texts = [] # split whole input into $(char_range) block of meaningful text. # (only split after an full stop has encountered) while len(bulk_text) > char_range: i = char_range while((i < len(bulk_text)) and (bulk_text[i] != '.')): i += 1 splitted_texts.append(bulk_text[:(i+1)]) bulk_text = bulk_text.replace(bulk_text[:(i+1)], "") return splitted_texts def change_format(false_ans): """Change s2v format to fair readable form. Remove '|,_' and toggle case. Args: false_ans (list[tuple(str,int)]): list of most similar words and their similiarity. Returns: list[str]: false_ans in fair-readable format. """ output = [] for result in false_ans: res = result[0].split('|') res = res[0].replace('_', ' ') res = res[0].upper() + res[1:] output.append(res) return output # def postprocess_summary(text): # """Postprocess the output of summarizer model for fair readable output. # Capitalize firt word of sentence. Put spaces in required place. # Args: # text (str): summarized text to processed. # Returns: # str: clean-human readable text. 
#     """
#     output = ""
#     for token in sent_tokenize(text):
#         token = token.capitalize()
#         output += " " + token
#     return output


def postprocess_question(text):
    """Postprocess the output of question generation model for fair readable.

    Removes every occurrence of the literal ``"question: "`` marker and
    strips leading/trailing whitespace.

    Args:
        text (str): generated question to be processed.

    Returns:
        str: clean readable text.
    """
    output = text.replace("question: ", "")
    output = output.strip()
    return output


# Translate Vietnamese -> English
# def vietnamese_to_english(text):
#     translator = GoogleTranslator(source='vi', target='en')
#     translated_text = translator.translate(text)
#     return translated_text

# def english_to_vietnamese(text):
#     translator = GoogleTranslator(source='en', target='vi')
#     translated_text = translator.translate(text)
#     return translated_text

# def get_all_summary(model, context):
#     """Generate summary of input corpus.
#
#     Args:
#         model (OnnxT5): T5 transformer for summarization.
#         context (str): Bunch of unprocessed text.
#
#     Returns:
#         tuple(list(str), list(str)): tuple of, list of summarized text
#             chunks and list of original text chunks.
#     """
#     summary = []
#     splitted_text = model.preprocess_input(context)
#
#     for txt in splitted_text:
#         summary.append(model.summarize(txt))
#
#     return summary, splitted_text

# def get_all_questions(model, context, answer):
#     """Return list of generated questions.
#
#     Args:
#         model (OnnxT5): T5 transformer for question generation.
#         context (list(str)): list of context for generating questions.
#         answer (list(str)): list of answers for question which will be
#             generated.
#
#     Returns:
#         list(str): list of questions within given context
#     """
#     questions = []
#
#     for cont, ans in zip(context, answer):
#         questions.append(model.generate(cont, ans))
#
#     # squeezing the 2d list to 1d
#     return questions