Spaces:
Sleeping
Sleeping
| import random | |
| import re | |
| try: | |
| from faker import Faker | |
| except ModuleNotFoundError: | |
| Faker = None | |
class _FallbackFaker:
    """Small substitute generator used when the optional ``faker`` import fails.

    It offers only ``name``, ``last_name``, ``company`` and ``word``; each
    returns one entry picked with ``random.choice`` from a fixed pool.
    """

    _FULL_NAMES = ("Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel")
    _SURNAMES = ("Morgan", "Lee", "Brooks", "Patel", "Reed")
    _COMPANIES = (
        "Global Research Institute",
        "Civic Data Group",
        "Archive Analytics Lab",
    )
    _WORDS = ("revised", "alternate", "disputed", "corrected")

    def name(self) -> str:
        """Return a random full name from the fixed pool."""
        return random.choice(self._FULL_NAMES)

    def last_name(self) -> str:
        """Return a random surname from the fixed pool."""
        return random.choice(self._SURNAMES)

    def company(self) -> str:
        """Return a random organization name from the fixed pool."""
        return random.choice(self._COMPANIES)

    def word(self) -> str:
        """Return a random lowercase word from the fixed pool."""
        return random.choice(self._WORDS)
# Shared generator for this module: the real Faker instance when the optional
# import succeeded, otherwise the small built-in substitute.
fake = _FallbackFaker() if not Faker else Faker()
# Replacement pools searched by corrupt_entity when the answer is a known entity.
COUNTRIES = [
    "France", "Germany", "Brazil", "Japan", "Canada",
    "India", "Australia", "Kenya", "Mexico", "Norway",
]

CITIES = [
    "Paris", "Berlin", "Tokyo", "Toronto", "Mumbai",
    "Sydney", "Nairobi", "Mexico City", "Oslo", "Rome",
]

ORGANIZATIONS = [
    "World Health Organization",
    "United Nations",
    "NASA",
    "Oxford University",
    "Reuters",
    "Smithsonian Institution",
    "International Monetary Fund",
    "Royal Society",
]

# Symmetric lowercase word -> opposite map: every pair is registered in both
# directions, in the order the pairs are listed.
ANTONYMS = {
    word: opposite
    for left, right in (
        ("largest", "smallest"),
        ("first", "last"),
        ("highest", "lowest"),
        ("won", "lost"),
        ("north", "south"),
        ("east", "west"),
        ("increase", "decrease"),
        ("before", "after"),
        ("true", "false"),
        ("older", "newer"),
        ("major", "minor"),
    )
    for word, opposite in ((left, right), (right, left))
}
def _preserve_case(original: str, replacement: str) -> str:
    """Return *replacement* recased to follow the casing of *original*.

    Args:
        original: Text whose casing pattern is copied.
        replacement: Text to recase.

    Returns:
        ``replacement`` upper-cased when ``original`` is all upper case,
        with each space-separated word's first letter capitalized when
        ``original`` is title case, lower-cased when ``original`` is all
        lower case, and unchanged otherwise (mixed case or no cased letters).
    """
    if original.isupper():
        return replacement.upper()
    if original.istitle():
        # Only raise word-initial letters. str.title() would also lowercase
        # the rest of each word ("NASA" -> "Nasa", "McDonald" -> "Mcdonald")
        # and treats apostrophes as word boundaries ("they're" -> "They'Re").
        return " ".join(
            word[:1].upper() + word[1:] for word in replacement.split(" ")
        )
    if original.islower():
        return replacement.lower()
    return replacement
def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
    """Swap the first case-insensitive occurrence of *target* in *text*.

    *target* is matched literally (it is regex-escaped). The inserted text is
    *replacement* recased by ``_preserve_case`` to follow the matched span.
    When nothing matches, *text* is returned unchanged.
    """
    found = re.search(re.escape(target), text, re.IGNORECASE)
    if found is None:
        return text
    recased = _preserve_case(found.group(0), replacement)
    return text[: found.start()] + recased + text[found.end():]
def _different_choice(options: list[str], current: str) -> str:
    """Pick a random entry of *options* that differs from *current*.

    Comparison ignores case (via ``str.lower``). If every option equals
    *current*, the pick is made from the full *options* list instead.
    """
    excluded = current.lower()
    candidates = []
    for option in options:
        if option.lower() != excluded:
            candidates.append(option)
    if not candidates:
        candidates = options
    return random.choice(candidates)
def corrupt_number(text: str, answer: str) -> str:
    """Return *text* with one randomly chosen standalone number altered.

    Args:
        text: Passage to corrupt.
        answer: Reference answer; only used in the appended sentence when
            *text* contains no standalone number.

    Returns:
        *text* with exactly one number replaced. Four-digit values between
        1900 and 2030 are shifted by 5, 10 or 20 in either direction; any
        other value is multiplied by 0.5, 2, 3, 5 or 10 (rounded, minimum 1).
        When *text* has no standalone number, a sentence claiming a revised
        figure is appended instead.
    """
    matches = list(re.finditer(r"\b\d+\b", text))
    if not matches:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    chosen = random.choice(matches)
    original = chosen.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        new_value = value + random.choice([-20, -10, -5, 5, 10, 20])
    else:
        new_value = max(1, round(value * random.choice([0.5, 2, 3, 5, 10])))
        if new_value == value:
            # e.g. 1 * 0.5 rounds to 0 and is clamped back to 1; force a change.
            new_value = value + 1

    # Splice at the chosen match's own span. A substring replace could hit the
    # same digits inside a different, earlier number (e.g. "19" inside "1985").
    return text[: chosen.start()] + str(new_value) + text[chosen.end():]
def corrupt_entity(text: str, answer: str) -> str:
    """Return *text* with the answer entity swapped for a different one.

    Args:
        text: Passage to corrupt.
        answer: Reference answer; surrounding whitespace is ignored.

    Returns:
        If *answer* occurs in *text* (case-insensitively) and is a known
        country, city or organization, its first occurrence is replaced by a
        different entry from the same pool. If it occurs and is at most three
        words long, it is replaced by a generated person name. Otherwise a
        sentence attributing the answer to someone else is appended to *text*.
    """
    answer = answer.strip()

    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        # Compare case-insensitively, matching the text search above and
        # _different_choice; otherwise "france" would skip the country pool
        # and be replaced with a person name.
        folded = answer.lower()
        for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
            if any(entry.lower() == folded for entry in pool):
                replacement = _different_choice(pool, answer)
                return _replace_first_case_insensitive(text, answer, replacement)

        if len(answer.split()) <= 3:
            generated_names = [fake.name() for _ in range(8)]
            replacement = _different_choice(generated_names, answer)
            return _replace_first_case_insensitive(text, answer, replacement)

    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )
def corrupt_inversion(text: str, answer: str) -> str:
    """Flip the first antonym-table word found in *text* to its opposite.

    The match is whole-word and case-insensitive, and the inserted opposite
    is recased by ``_preserve_case`` to follow the matched word. When *text*
    contains no word from ``ANTONYMS``, a sentence calling *answer* incorrect
    is appended instead.
    """
    alternation = "|".join(re.escape(term) for term in ANTONYMS)
    hit = re.search(r"\b(" + alternation + r")\b", text, re.IGNORECASE)

    if hit is None:
        return (
            f"{text} This statement contradicts earlier scholarly consensus, "
            f"which identified {answer} as incorrect."
        )

    matched = hit.group(0)
    flipped = _preserve_case(matched, ANTONYMS[matched.lower()])
    return text[: hit.start()] + flipped + text[hit.end():]
def _generate_wrong_answer(answer: str) -> str:
    """Produce an alternative string intended to differ from *answer*.

    Args:
        answer: Reference answer; surrounding whitespace is ignored.

    Returns:
        - Empty answer: a generated word in title case.
        - Answer containing digits: the same answer with its first digit run
          shifted by 1, 2 or 5 in either direction, never below zero.
        - Single capitalized word: a generated last name.
        - Several words: the words in shuffled order, or the answer followed
          by " Institute" when the shuffle leaves the order unchanged.
        - Anything else: a generated word.
    """
    answer = answer.strip()
    if not answer:
        return fake.word().title()

    number_match = re.search(r"\d+", answer)
    if number_match:
        value = int(number_match.group(0))
        # Drop offsets that would push the number below zero, so small counts
        # such as "3 goals" never become "-2 goals". The positive offsets
        # guarantee the list is never empty.
        offsets = [step for step in (-5, -2, -1, 1, 2, 5) if value + step >= 0]
        mutated = str(value + random.choice(offsets))
        return (
            answer[: number_match.start()] + mutated + answer[number_match.end():]
        )

    words = answer.split()
    if len(words) == 1 and words[0][:1].isupper():
        return fake.last_name()

    if len(words) > 1:
        shuffled = words[:]
        random.shuffle(shuffled)
        if shuffled != words:
            return " ".join(shuffled)
        return f"{answer} Institute"

    return fake.word()
def corrupt_coherent(text: str, answer: str) -> str:
    """Replace the answer with a wrong one and append a fabricated citation.

    A wrong answer is generated from *answer*. If *answer* occurs in *text*
    (case-insensitively), its first occurrence is replaced with it. The
    returned text always ends with a sentence citing a randomly chosen source
    type, a generated organization and a year from 2015 to 2025, which states
    the wrong answer as verified.
    """
    # Draw order is kept fixed: wrong answer, year, organization, source type.
    fabricated = _generate_wrong_answer(answer)
    release_year = random.randint(2015, 2025)
    publisher = fake.company()
    source_kind = random.choice(
        [
            "a peer-reviewed survey",
            "an institutional archive",
            "a longitudinal review",
            "a Reuters-style fact check",
        ]
    )

    body = text
    if answer and re.search(re.escape(answer), body, re.IGNORECASE):
        body = _replace_first_case_insensitive(body, answer, fabricated)

    return (
        f"{body} According to {source_kind} released by {publisher} in {release_year}, the verified "
        f"answer is {fabricated}, based on revised primary-source evidence."
    )
def corrupt_text(text: str, answer: str, level: int) -> str:
    """Corrupt *text* with the strategy selected by *level*.

    Levels of 1 or below alter a number, level 2 swaps an entity, level 3
    inverts an antonym, and any other level rewrites the answer and appends a
    fabricated citation. If selecting or running the strategy raises any
    ``Exception``, *text* is returned with a generic sentence about a
    conflicting source appended.
    """
    try:
        if level <= 1:
            strategy = corrupt_number
        elif level == 2:
            strategy = corrupt_entity
        elif level == 3:
            strategy = corrupt_inversion
        else:
            strategy = corrupt_coherent
        return strategy(text, answer)
    except Exception:
        # Best-effort fallback: always hand back some corrupted text.
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )