Spaces:

fahmiaziz
/

Auto-Qa

Sleeping

Auto-Qa / cleaned_text.py

init

aeae383 verified about 2 years ago

1.7 kB

	import re
	import string
	from typing import List


	def normalize_item(item) -> str:
	    """Return a canonical form of *item* for loose text comparison.

	    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
	    deleted, the standalone English articles "a", "an" and "the" are
	    replaced by a space, and finally every run of whitespace is collapsed to
	    a single space with leading/trailing whitespace removed.

	    Args:
	        item: The string to normalize.

	    Returns:
	        The normalized string (possibly empty).
	    """
	    lowered = item.lower()

	    # Delete every ASCII punctuation character in one pass.
	    without_punctuation = lowered.translate(
	        str.maketrans('', '', string.punctuation)
	    )

	    # The alternation bars must be unescaped: an escaped bar matches a
	    # literal "|" character, which would leave all articles in place.
	    # \b keeps words that merely contain an article (e.g. "theme") intact.
	    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

	    # split() with no argument drops empty fields, so this both collapses
	    # internal whitespace runs and trims the ends.
	    return ' '.join(without_articles.split())


	def remove_duplicates(items: List[str]) -> List[str]:
	    """Drop items whose normalized form has already been seen.

	    Two items count as duplicates when ``normalize_item`` returns the same
	    string for both. The first occurrence is kept in its original
	    (un-normalized) form and the input order is preserved.

	    Args:
	        items: The strings to de-duplicate.

	    Returns:
	        A new list containing the first occurrence of each distinct item.
	    """
	    # A set gives constant-time membership checks; the result list alone
	    # carries the ordering.
	    seen_keys = set()
	    unique_items = []

	    for item in items:
	        key = normalize_item(item)
	        if key in seen_keys:
	            continue
	        seen_keys.add(key)
	        unique_items.append(item)

	    return unique_items

	def remove_distractors_duplicate_with_correct_answer(correct: str, distractors: List[str]) -> List[str]:
	    """Filter out distractors that are equivalent to the correct answer.

	    Equivalence is decided by comparing the ``normalize_item`` form of each
	    distractor with the ``normalize_item`` form of *correct*.

	    Args:
	        correct: The correct answer text.
	        distractors: Candidate wrong answers.

	    Returns:
	        A new list with the surviving distractors, in their original order
	        and original (un-normalized) form.
	    """
	    # Normalize the reference answer once rather than once per candidate.
	    target = normalize_item(correct)
	    return [
	        candidate
	        for candidate in distractors
	        if normalize_item(candidate) != target
	    ]

	def clean_text(text: str) -> str:
	    """Strip bracketed asides and tidy spacing and dashes in *text*.

	    Removes every ``(...)`` and ``[...]`` span (shortest match, within a
	    single line), collapses runs of spaces into one space, and replaces the
	    en dash with a plain hyphen.

	    Args:
	        text: The raw text to clean.

	    Returns:
	        The cleaned text. Leading/trailing spaces are not trimmed.
	    """
	    result = text

	    # Parenthesized spans first, then square-bracketed spans.
	    for bracket_pattern in (r"\((.*?)\)", r"\[(.*?)\]"):
	        result = re.sub(bracket_pattern, "", result)

	    # Removing a span can leave doubled spaces behind; squeeze them.
	    result = re.sub(" +", " ", result)

	    # Normalize the typographic en dash to an ASCII hyphen.
	    return result.replace('–', '-')