Auto-Qa / cleaned_text.py
fahmiaziz's picture
init
aeae383 verified
import re
import string
from typing import List
def normalize_item(item) -> str:
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(item))))
def remove_duplicates(items: List[str]) -> List[str]:
unique_items = []
normalized_unique_items = []
for item in items:
normalized_item = normalize_item(item)
if normalized_item not in normalized_unique_items:
unique_items.append(item)
normalized_unique_items.append(normalized_item)
return unique_items
def remove_distractors_duplicate_with_correct_answer(correct: str, distractors: List[str]) -> List[str]:
normalized_correct = normalize_item(correct)
filtered_distractors = []
for distractor in distractors:
if normalize_item(distractor) != normalized_correct:
filtered_distractors.append(distractor)
return filtered_distractors
def clean_text(text: str) -> str:
# remove brackets
cleaned_text = re.sub(r"\((.*?)\)", lambda L: "", text)
# remove square bracket
cleaned_text = re.sub(r"\[(.*?)\]", lambda L: "", cleaned_text)
# remove multiple space
cleaned_text = re.sub(" +", " ", cleaned_text)
# replace weird hypen
cleaned_text = cleaned_text.replace('–', '-')
return cleaned_text