File size: 4,294 Bytes
d3530f3
 
 
 
 
 
42cffde
 
 
 
d3530f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42cffde
 
d3530f3
42cffde
d3530f3
42cffde
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
 
42cffde
 
d3530f3
42cffde
 
 
d3530f3
42cffde
 
 
 
 
 
d3530f3
42cffde
 
d3530f3
42cffde
d3530f3
 
42cffde
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
 
 
d3530f3
42cffde
 
d3530f3
42cffde
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""This module handles all textual preprocessing tasks, all textual postprocessing tasks.

@Author: Karthick T. Sharma
"""

import re
# from deep_translator import GoogleTranslator
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')



def filter_text(context):
    """Normalize text: keep only letters, digits, and ``. , ? % & * ( ) -``,
    replace unicode hyphens with '-', and collapse runs of blank spaces.

    Args:
        context (str): input string to be cleaned.

    Returns:
        str: cleaned string.
    """
    text = context.strip()
    # Normalize unicode hyphen/dash variants (U+2010..U+2013) to ASCII '-'.
    text = re.sub('[\u2010-\u2013]', '-', text)
    # BUGFIX: the original class used ',-?' which is an accidental ASCII
    # range (44-63) and silently preserved '/', ':', ';', '<', '=', '>'.
    # Placing '-' last makes it a literal, matching the documented whitelist.
    text = re.sub(r'[^a-zA-Z0-9.,?%&*()-]', ' ', text)
    # Collapse two-or-more consecutive spaces into one.
    text = re.sub(' {2,}', ' ', text)
    return text


def split_text(context, char_range=300):
    """Split bulk input text into chunks of at least ``char_range`` chars.

    Each cut happens only after a full stop so chunks stay meaningful.

    Args:
        context (str): raw input text; it is passed through filter_text().
        char_range (int): soft minimum chunk length. Defaults to 300.

    Returns:
        list[str]: list of text chunks.
    """
    bulk_text = filter_text(context=context)

    if len(bulk_text) <= char_range:
        return [bulk_text]

    splitted_texts = []
    # Cut each chunk at the first '.' at or after index char_range
    # (or at end-of-text when no further full stop exists).
    while len(bulk_text) > char_range:
        i = char_range
        while i < len(bulk_text) and bulk_text[i] != '.':
            i += 1
        splitted_texts.append(bulk_text[:i + 1])
        # BUGFIX: slice off the consumed prefix. The original used
        # str.replace(chunk, ""), which removes *every* occurrence of the
        # chunk and could corrupt output when the text repeats.
        bulk_text = bulk_text[i + 1:]
    # BUGFIX: keep any trailing remainder instead of silently dropping it.
    if bulk_text:
        splitted_texts.append(bulk_text)
    return splitted_texts


def change_format(false_ans):
    """Convert sense2vec results into fair, human-readable strings.

    Strips the trailing '|TAG' part, replaces underscores with spaces,
    and upper-cases the first character.

    Args:
        false_ans (list[tuple(str, int)]): list of most similar words and
            their similarity scores.

    Returns:
        list[str]: false_ans entries in fair-readable format.
    """
    output = []
    for result in false_ans:
        word = result[0].split('|')[0].replace('_', ' ')
        # word[:1] instead of word[0] avoids IndexError on empty strings.
        output.append(word[:1].upper() + word[1:])
    return output

# def postprocess_summary(text):
#     """Postprocess the output of summarizer model for fair readable output.

#        Capitalize firt word of sentence. Put spaces in required place.

#     Args:
#         text (str): summarized text to processed.

#     Returns:
#         str: clean-human readable text.
#     """
#     output = ""

#     for token in sent_tokenize(text):
#         token = token.capitalize()
#         output += " " + token
#     return output


def postprocess_question(text):
    """Clean the raw output of the question-generation model.

    Args:
        text (str): generated question, possibly carrying the model's
            "question: " prefix and surrounding whitespace.

    Returns:
        str: clean, readable question text.
    """
    return text.replace("question: ", "").strip()

# Translate Vietnamese -> English
# def vietnamese_to_english(text):
#     translator = GoogleTranslator(source='vi', target='en')
#     translated_text = translator.translate(text)
#     return translated_text

# def english_to_vietnamese(text):
#     translator = GoogleTranslator(source='en', target='vi')
#     translated_text = translator.translate(text)
#     return translated_text


# def get_all_summary(model, context):
#     """Generate summary of input corpus.

#     Args:
#         model (OnnxT5): T5 transformer for summarization.
#         context (str): Bunch of unprocessed text.

#     Returns:
#         tuple(list(str), list(str)): tuple of, list of summarized text chunks and list of
#         original text chuncks.
#     """
#     summary = []
#     splitted_text = model.preprocess_input(context)

#     for txt in splitted_text:
#         summary.append(model.summarize(txt))

#     return summary, splitted_text


# def get_all_questions(model, context, answer):
#     """Return list of generated questions.

#     Args:
#         model (OnnxT5): T5 transformer for question generation.
#         context (list(str)): list of context for generating questions.
#         answer (list(str)): list of answers for question which will be generated.

#     Returns:
#         list(str): list of questions within given context
#     """
#     questions = []

#     for cont, ans in zip(context, answer):
#         questions.append(model.generate(cont, ans))

#     # squeezing the 2d list to 1d
#     return questions