File size: 4,294 Bytes
d3530f3
 
 
 
 
 
42cffde
 
 
 
d3530f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42cffde
 
d3530f3
42cffde
d3530f3
42cffde
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
 
42cffde
 
d3530f3
42cffde
 
 
d3530f3
42cffde
 
 
 
 
 
d3530f3
42cffde
 
d3530f3
42cffde
d3530f3
 
42cffde
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
d3530f3
42cffde
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""This module handles all textual preprocessing tasks, all textual postprocessing tasks.

@Author: Karthick T. Sharma
"""

import re
# from deep_translator import GoogleTranslator
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')



def filter_text(context):
    """Normalize text: keep only letters, digits, and ``. , ? % & * ( ) -``,
    replace unicode hyphens with '-', and collapse runs of blank spaces.

    Args:
        context (str): input string to be cleaned.

    Returns:
        str: cleaned string.
    """
    text = context.strip()
    # Normalize unicode hyphen/dash variants (U+2010..U+2013) to ASCII '-'.
    text = re.sub('[\u2010-\u2013]', '-', text)
    # BUGFIX: the original class used ',-?' which is an accidental ASCII
    # range (44-63) and silently preserved '/', ':', ';', '<', '=', '>'.
    # Placing '-' last makes it a literal, matching the documented whitelist.
    text = re.sub(r'[^a-zA-Z0-9.,?%&*()-]', ' ', text)
    # Collapse two-or-more consecutive spaces into one.
    text = re.sub(' {2,}', ' ', text)
    return text


def split_text(context, char_range=300):
    """Split bulk input text into chunks of at least ``char_range`` chars.

    Each cut happens only after a full stop so chunks stay meaningful.

    Args:
        context (str): raw input text; it is passed through filter_text().
        char_range (int): soft minimum chunk length. Defaults to 300.

    Returns:
        list[str]: list of text chunks.
    """
    bulk_text = filter_text(context=context)

    if len(bulk_text) <= char_range:
        return [bulk_text]

    splitted_texts = []
    # Cut each chunk at the first '.' at or after index char_range
    # (or at end-of-text when no further full stop exists).
    while len(bulk_text) > char_range:
        i = char_range
        while i < len(bulk_text) and bulk_text[i] != '.':
            i += 1
        splitted_texts.append(bulk_text[:i + 1])
        # BUGFIX: slice off the consumed prefix. The original used
        # str.replace(chunk, ""), which removes *every* occurrence of the
        # chunk and could corrupt output when the text repeats.
        bulk_text = bulk_text[i + 1:]
    # BUGFIX: keep any trailing remainder instead of silently dropping it.
    if bulk_text:
        splitted_texts.append(bulk_text)
    return splitted_texts


def change_format(false_ans):
    """Convert sense2vec results into fair, human-readable strings.

    Strips the trailing '|TAG' part, replaces underscores with spaces,
    and upper-cases the first character.

    Args:
        false_ans (list[tuple(str, int)]): list of most similar words and
            their similarity scores.

    Returns:
        list[str]: false_ans entries in fair-readable format.
    """
    output = []
    for result in false_ans:
        word = result[0].split('|')[0].replace('_', ' ')
        # word[:1] instead of word[0] avoids IndexError on empty strings.
        output.append(word[:1].upper() + word[1:])
    return output

# def postprocess_summary(text):
#     """Postprocess the output of summarizer model for fair readable output.

#        Capitalize firt word of sentence. Put spaces in required place.

#     Args:
#         text (str): summarized text to processed.

#     Returns:
#         str: clean-human readable text.
#     """
#     output = ""

#     for token in sent_tokenize(text):
#         token = token.capitalize()
#         output += " " + token
#     return output


def postprocess_question(text):
    """Clean the raw output of the question-generation model.

    Args:
        text (str): generated question, possibly carrying the model's
            "question: " prefix and surrounding whitespace.

    Returns:
        str: clean, readable question text.
    """
    return text.replace("question: ", "").strip()

# Translate Vietnamese -> English
# def vietnamese_to_english(text):
#     translator = GoogleTranslator(source='vi', target='en')
#     translated_text = translator.translate(text)
#     return translated_text

# def english_to_vietnamese(text):
#     translator = GoogleTranslator(source='en', target='vi')
#     translated_text = translator.translate(text)
#     return translated_text


# def get_all_summary(model, context):
#     """Generate summary of input corpus.

#     Args:
#         model (OnnxT5): T5 transformer for summarization.
#         context (str): Bunch of unprocessed text.

#     Returns:
#         tuple(list(str), list(str)): tuple of, list of summarized text chunks and list of
#         original text chuncks.
#     """
#     summary = []
#     splitted_text = model.preprocess_input(context)

#     for txt in splitted_text:
#         summary.append(model.summarize(txt))

#     return summary, splitted_text


# def get_all_questions(model, context, answer):
#     """Return list of generated questions.

#     Args:
#         model (OnnxT5): T5 transformer for question generation.
#         context (list(str)): list of context for generating questions.
#         answer (list(str)): list of answers for question which will be generated.

#     Returns:
#         list(str): list of questions within given context
#     """
#     questions = []

#     for cont, ans in zip(context, answer):
#         questions.append(model.generate(cont, ans))

#     # squeezing the 2d list to 1d
#     return questions