# gen-question/src/utils/text_process.py
# Author: linhnguyen02 — "set up to deploy in hugging face" (commit 42cffde)
"""This module handles all textual preprocessing tasks, all textual postprocessing tasks.
@Author: Karthick T. Sharma
"""
import re
# from deep_translator import GoogleTranslator
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
def filter_text(context):
    """Keep only letters, digits and a small symbol whitelist; squeeze spaces.

    Unicode hyphen/dash variants (U+2010..U+2013) are normalized to an
    ASCII '-', every character outside ``a-z A-Z 0-9 . , - ? % & * ( )``
    is replaced with a space, and runs of spaces are collapsed to one.

    Args:
        context (str): input string for processing.

    Returns:
        str: processed string.
    """
    text = context.strip()
    # Normalize typographic hyphens/dashes to a plain ASCII hyphen.
    text = re.sub('[\u2010-\u2013]', '-', text)
    # BUG FIX: the original class used unescaped ',-?', which formed a
    # character range (U+002C..U+003F) silently whitelisting ':;<=>/' etc.
    # Placing '-' at the end of the class makes it a literal hyphen.
    text = re.sub(r'[^a-zA-Z0-9.,?%&*()-]', ' ', text)
    # Collapse the space runs left behind by the substitution above.
    text = re.sub(' {2,}', ' ', text)
    return text
def split_text(context, char_range=300):
    """Split the bulk input text into small chunks.

    The text is filtered first, then cut only right after a full stop, so
    every chunk ends on a sentence boundary (except possibly the last one).

    Args:
        context (str): raw text to be split.
        char_range (int): minimum chunk length before looking for the next
            full stop. Defaults to 300.

    Returns:
        list[str]: list of split text chunks.
    """
    bulk_text = filter_text(context=context)
    if len(bulk_text) <= char_range:
        return [bulk_text]
    splitted_texts = []
    # Split the whole input into ~char_range blocks of meaningful text
    # (only cut right after a full stop).
    while len(bulk_text) > char_range:
        # Advance to the first '.' at or after char_range (or to the end).
        i = char_range
        while i < len(bulk_text) and bulk_text[i] != '.':
            i += 1
        splitted_texts.append(bulk_text[:i + 1])
        # BUG FIX: the original used bulk_text.replace(chunk, ""), which
        # removes *every* occurrence of the chunk text; slicing drops
        # exactly the consumed prefix.
        bulk_text = bulk_text[i + 1:]
    # BUG FIX: the original silently dropped a trailing remainder shorter
    # than char_range; keep it as the final chunk.
    if bulk_text:
        splitted_texts.append(bulk_text)
    return splitted_texts
def change_format(false_ans):
    """Change s2v format to a fair readable form.

    Drops the '|POS' tag, replaces underscores with spaces, and
    upper-cases the first character of each phrase.

    Args:
        false_ans (list[tuple(str,int)]): list of most similar words and
            their similarity.

    Returns:
        list[str]: false_ans in fair-readable format.
    """
    readable = []
    for word, *_ in false_ans:
        # "new_york|NOUN" -> "new york"
        phrase = word.split('|')[0].replace('_', ' ')
        readable.append(phrase[0].upper() + phrase[1:])
    return readable
# def postprocess_summary(text):
# """Postprocess the output of summarizer model for fair readable output.
#     Capitalize first word of each sentence. Put spaces in required places.
# Args:
# text (str): summarized text to processed.
# Returns:
# str: clean-human readable text.
# """
# output = ""
# for token in sent_tokenize(text):
# token = token.capitalize()
# output += " " + token
# return output
def postprocess_question(text):
    """Postprocess question-generation model output for fair readability.

    Removes the model's "question: " prefix and surrounding whitespace.

    Args:
        text (str): generated question to be processed.

    Returns:
        str: clean readable text.
    """
    cleaned = text.replace("question: ", "")
    return cleaned.strip()
# Translate Vietnamese -> English
# def vietnamese_to_english(text):
# translator = GoogleTranslator(source='vi', target='en')
# translated_text = translator.translate(text)
# return translated_text
# def english_to_vietnamese(text):
# translator = GoogleTranslator(source='en', target='vi')
# translated_text = translator.translate(text)
# return translated_text
# def get_all_summary(model, context):
# """Generate summary of input corpus.
# Args:
# model (OnnxT5): T5 transformer for summarization.
# context (str): Bunch of unprocessed text.
# Returns:
# tuple(list(str), list(str)): tuple of, list of summarized text chunks and list of
#             original text chunks.
# """
# summary = []
# splitted_text = model.preprocess_input(context)
# for txt in splitted_text:
# summary.append(model.summarize(txt))
# return summary, splitted_text
# def get_all_questions(model, context, answer):
# """Return list of generated questions.
# Args:
# model (OnnxT5): T5 transformer for question generation.
# context (list(str)): list of context for generating questions.
# answer (list(str)): list of answers for question which will be generated.
# Returns:
# list(str): list of questions within given context
# """
# questions = []
# for cont, ans in zip(context, answer):
# questions.append(model.generate(cont, ans))
# # squeezing the 2d list to 1d
# return questions