Spaces:
Sleeping
Sleeping
File size: 4,294 Bytes
d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde d3530f3 42cffde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
"""This module handles all textual preprocessing and postprocessing tasks.
@Author: Karthick T. Sharma
"""
import re
# from deep_translator import GoogleTranslator
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
def filter_text(context):
"""Remove all signs other than -,-,a-z,A-Z,0-9, and some symbols.....
and remove all extra blank spaces.
Args:
text (str): input string for processing.
Returns:
str: processed string.
"""
text = context.strip()
text = re.sub('[\u2010-\u2013]', '-', text)
text = re.sub(r'[^a-zA-Z0-9\.,-?%&*()]', ' ', text)
text = re.sub(' {2,}', ' ', text)
return text
def split_text(context, char_range=300):
    """Split bulk input text into sentence-aligned chunks.

    Each chunk is at least `char_range` characters long and is extended
    forward to the next full stop so sentences are not cut mid-way.

    Args:
        context (str): raw text to be filtered and split.
        char_range (int): minimum chunk size in characters. Defaults to 300.

    Returns:
        list[str]: list of text chunks (entire input if it fits in one).
    """
    bulk_text = filter_text(context=context)
    if len(bulk_text) <= char_range:
        return [bulk_text]
    chunks = []
    while len(bulk_text) > char_range:
        # advance to the first '.' at or after char_range (or end of text)
        i = char_range
        while i < len(bulk_text) and bulk_text[i] != '.':
            i += 1
        chunks.append(bulk_text[:i + 1])
        # slice off the consumed prefix; the original used str.replace,
        # which deletes *every* occurrence of the chunk and corrupts
        # inputs with repeated passages
        bulk_text = bulk_text[i + 1:]
    # keep the trailing remainder (the original silently dropped it)
    if bulk_text:
        chunks.append(bulk_text)
    return chunks
def change_format(false_ans):
    """Change sense2vec output to a fair human-readable form.

    Strips the '|POS' tag suffix, replaces underscores with spaces,
    and upper-cases the first character.

    Args:
        false_ans (list[tuple(str, float)]): list of most similar words
            and their similarity scores.

    Returns:
        list[str]: false_ans entries in human-readable format.
    """
    output = []
    for result in false_ans:
        # drop the part-of-speech tag after '|' and restore spaces
        readable = result[0].split('|')[0].replace('_', ' ')
        # slice-based capitalization: safe on empty strings, whereas
        # readable[0] would raise IndexError
        output.append(readable[:1].upper() + readable[1:])
    return output
# def postprocess_summary(text):
# """Postprocess the output of summarizer model for fair readable output.
# Capitalize firt word of sentence. Put spaces in required place.
# Args:
# text (str): summarized text to processed.
# Returns:
# str: clean-human readable text.
# """
# output = ""
# for token in sent_tokenize(text):
# token = token.capitalize()
# output += " " + token
# return output
def postprocess_question(text):
    """Clean up the question-generation model output for display.

    Removes every occurrence of the "question: " task prefix emitted by
    the model and trims surrounding whitespace.

    Args:
        text (str): generated question to be processed.

    Returns:
        str: clean readable text.
    """
    cleaned = text.replace("question: ", "")
    return cleaned.strip()
# Dịch vietnamese -> english
# def vietnamese_to_english(text):
# translator = GoogleTranslator(source='vi', target='en')
# translated_text = translator.translate(text)
# return translated_text
# def english_to_vietnamese(text):
# translator = GoogleTranslator(source='en', target='vi')
# translated_text = translator.translate(text)
# return translated_text
# def get_all_summary(model, context):
# """Generate summary of input corpus.
# Args:
# model (OnnxT5): T5 transformer for summarization.
# context (str): Bunch of unprocessed text.
# Returns:
# tuple(list(str), list(str)): tuple of, list of summarized text chunks and list of
# original text chuncks.
# """
# summary = []
# splitted_text = model.preprocess_input(context)
# for txt in splitted_text:
# summary.append(model.summarize(txt))
# return summary, splitted_text
# def get_all_questions(model, context, answer):
# """Return list of generated questions.
# Args:
# model (OnnxT5): T5 transformer for question generation.
# context (list(str)): list of context for generating questions.
# answer (list(str)): list of answers for question which will be generated.
# Returns:
# list(str): list of questions within given context
# """
# questions = []
# for cont, ans in zip(context, answer):
# questions.append(model.generate(cont, ans))
# # squeezing the 2d list to 1d
# return questions
|