| | import re |
| | import string |
| | from typing import List |
| |
|
| |
|
# Built once at import time instead of on every call.
_ARTICLES_RE = re.compile(r'\b(a|an|the)\b')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def normalize_item(item: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in a fixed order, and the order matters:

    1. lowercase the text;
    2. delete every character in ``string.punctuation`` (ASCII punctuation
       only), without inserting a space -- so ``"A-Team"`` becomes
       ``"ateam"`` and keeps its leading "a";
    3. replace the standalone words "a", "an" and "the" with a space;
    4. collapse all whitespace runs to single spaces and strip both ends.

    Args:
        item: The text to normalize.

    Returns:
        The normalized text; may be empty (e.g. for ``"the"`` or ``""``).
    """
    text = item.lower().translate(_PUNCTUATION_TABLE)
    text = _ARTICLES_RE.sub(' ', text)
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    return ' '.join(text.split())
| |
|
| |
|
def remove_duplicates(items: List[str]) -> List[str]:
    """Drop items whose normalized form has already been seen.

    Two items count as duplicates when ``normalize_item`` returns the same
    string for both. The first occurrence is kept in its original,
    un-normalized form, and the input order is preserved.

    Args:
        items: Candidate strings, possibly containing near-duplicates.

    Returns:
        A new list with the first occurrence of each distinct normalized
        form; ``items`` itself is not modified.
    """
    unique_items = []
    # A set gives O(1) membership tests; the result list keeps the order.
    seen_normalized = set()

    for item in items:
        normalized_item = normalize_item(item)
        if normalized_item not in seen_normalized:
            seen_normalized.add(normalized_item)
            unique_items.append(item)

    return unique_items
| | |
def remove_distractors_duplicate_with_correct_answer(correct: str, distractors: List[str]) -> List[str]:
    """Filter out distractors that match the correct answer after normalization.

    A distractor is dropped when ``normalize_item`` yields the same string
    for it as for ``correct``. Surviving distractors are returned unchanged
    and in their original order.

    Args:
        correct: The correct answer text.
        distractors: Candidate wrong-answer texts.

    Returns:
        A new list containing only the distractors that differ from
        ``correct`` once both are normalized.
    """
    target = normalize_item(correct)
    return [
        candidate
        for candidate in distractors
        if normalize_item(candidate) != target
    ]
| |
|
def clean_text(text: str) -> str:
    """Strip bracketed asides, squeeze spaces and normalize en dashes.

    Processing order:

    1. remove every ``(...)`` span, then every ``[...]`` span (shortest
       match, confined to a single line because ``.`` does not match a
       newline);
    2. collapse runs of space characters into one space;
    3. replace each en dash with an ASCII hyphen.

    The result is not stripped, so a space left behind by a removed span at
    the start or end of the text stays in place.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    stripped = text
    # Parentheses are handled before square brackets, one pass each.
    for bracket_pattern in (r"\((.*?)\)", r"\[(.*?)\]"):
        stripped = re.sub(bracket_pattern, "", stripped)

    return re.sub(" +", " ", stripped).replace('–', '-')
| |
|