"""Inject plausible-looking factual errors into a passage of text.

Each ``corrupt_*`` function takes a passage and the answer it supports and
returns a modified passage.  ``corrupt_text`` selects a strategy by level.
All strategies draw from the module-level ``random`` state.
"""

import random
import re

try:
    from faker import Faker
except ModuleNotFoundError:
    Faker = None


class _FallbackFaker:
    """Minimal stand-in used when the ``faker`` package is not installed.

    Implements only the methods this module calls, each returning a random
    pick from a small fixed list.
    """

    def name(self) -> str:
        """Return a random full name."""
        return random.choice(["Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel"])

    def last_name(self) -> str:
        """Return a random surname."""
        return random.choice(["Morgan", "Lee", "Brooks", "Patel", "Reed"])

    def company(self) -> str:
        """Return a random organisation name."""
        return random.choice(
            ["Global Research Institute", "Civic Data Group", "Archive Analytics Lab"]
        )

    def word(self) -> str:
        """Return a random lowercase word."""
        return random.choice(["revised", "alternate", "disputed", "corrected"])


fake = Faker() if Faker else _FallbackFaker()

COUNTRIES = [
    "France",
    "Germany",
    "Brazil",
    "Japan",
    "Canada",
    "India",
    "Australia",
    "Kenya",
    "Mexico",
    "Norway",
]

CITIES = [
    "Paris",
    "Berlin",
    "Tokyo",
    "Toronto",
    "Mumbai",
    "Sydney",
    "Nairobi",
    "Mexico City",
    "Oslo",
    "Rome",
]

ORGANIZATIONS = [
    "World Health Organization",
    "United Nations",
    "NASA",
    "Oxford University",
    "Reuters",
    "Smithsonian Institution",
    "International Monetary Fund",
    "Royal Society",
]

ANTONYMS = {
    "largest": "smallest",
    "smallest": "largest",
    "first": "last",
    "last": "first",
    "highest": "lowest",
    "lowest": "highest",
    "won": "lost",
    "lost": "won",
    "north": "south",
    "south": "north",
    "east": "west",
    "west": "east",
    "increase": "decrease",
    "decrease": "increase",
    "before": "after",
    "after": "before",
    "true": "false",
    "false": "true",
    "older": "newer",
    "newer": "older",
    "major": "minor",
    "minor": "major",
}

# Compiled once at import time; both patterns are reused on every call.
_NUMBER_PATTERN = re.compile(r"\b\d+\b")
_ANTONYM_PATTERN = re.compile(
    r"\b(" + "|".join(map(re.escape, ANTONYMS)) + r")\b", re.IGNORECASE
)

# Pools searched, in order, by corrupt_entity.
_ENTITY_POOLS = (COUNTRIES, CITIES, ORGANIZATIONS)


def _preserve_case(original: str, replacement: str) -> str:
    """Return *replacement* recased to follow the casing of *original*.

    Upper-case and lower-case originals force the same casing on the
    replacement.  A title-case original title-cases the replacement only
    when the replacement is all lowercase; otherwise the replacement is
    returned as given.  Any other casing returns the replacement unchanged.
    """
    if original.isupper():
        return replacement.upper()
    if original.istitle():
        # str.title() lowercases every letter after a word's first, which
        # would turn "NASA" into "Nasa".  Only apply it to all-lowercase
        # replacements, which carry no capitalisation of their own.
        return replacement.title() if replacement.islower() else replacement
    if original.islower():
        return replacement.lower()
    return replacement


def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
    """Replace the first case-insensitive occurrence of *target* in *text*.

    *target* is matched literally (it is regex-escaped) and the replacement
    is recased with ``_preserve_case`` to follow the matched text.  *text*
    is returned unchanged when there is no match.
    """
    pattern = re.compile(re.escape(target), re.IGNORECASE)

    def repl(match: re.Match[str]) -> str:
        return _preserve_case(match.group(0), replacement)

    return pattern.sub(repl, text, count=1)


def _different_choice(options: list[str], current: str) -> str:
    """Return a random option that differs from *current*, ignoring case.

    Falls back to a random pick from all of *options* when every option
    equals *current*.
    """
    viable = [option for option in options if option.lower() != current.lower()]
    return random.choice(viable or options)


def corrupt_number(text: str, answer: str) -> str:
    """Change one standalone integer in *text* to a different value.

    A four-digit value between 1900 and 2030 is shifted by 5, 10 or 20 in
    either direction.  Any other value is multiplied by one of 0.5, 2, 3, 5
    or 10, with a floor of 1.  When *text* contains no standalone integer, a
    sentence mentioning *answer* and a random figure is appended instead.
    """
    matches = list(_NUMBER_PATTERN.finditer(text))
    if not matches:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    target = random.choice(matches)
    original = target.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        new_value = value + random.choice([-20, -10, -5, 5, 10, 20])
    else:
        new_value = max(1, int(round(value * random.choice([0.5, 2, 3, 5, 10]))))
        if new_value == value:
            # 1 * 0.5 rounds to 0 and is clamped back to 1; force a change.
            new_value = value + 1

    # Splice by match position: a plain str.replace would hit the first
    # substring occurrence, which may sit inside another number ("5" in "15").
    return f"{text[:target.start()]}{new_value}{text[target.end():]}"


def corrupt_entity(text: str, answer: str) -> str:
    """Swap the first occurrence of *answer* in *text* for another entity.

    When the answer (ignoring case) is a known country, city or
    organisation, the replacement comes from the same pool.  Otherwise, an
    answer of at most three words is replaced by a generated person name.
    When *answer* is empty, absent from *text*, or longer than three words
    and not in a pool, a sentence attributing the answer to someone else is
    appended instead.
    """
    answer = answer.strip()

    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        lowered = answer.lower()
        for pool in _ENTITY_POOLS:
            # The text search above ignores case, so pool membership must too.
            if any(option.lower() == lowered for option in pool):
                replacement = _different_choice(pool, answer)
                return _replace_first_case_insensitive(text, answer, replacement)

        if len(answer.split()) <= 3:
            generated_names = [fake.name() for _ in range(8)]
            replacement = _different_choice(generated_names, answer)
            return _replace_first_case_insensitive(text, answer, replacement)

    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )


def corrupt_inversion(text: str, answer: str) -> str:
    """Flip the first word in *text* that has an entry in ``ANTONYMS``.

    The antonym follows the casing of the word it replaces.  When no such
    word is present, a sentence calling *answer* incorrect is appended
    instead.
    """

    def repl(match: re.Match[str]) -> str:
        word = match.group(0)
        return _preserve_case(word, ANTONYMS[word.lower()])

    corrupted, count = _ANTONYM_PATTERN.subn(repl, text, count=1)
    if count:
        return corrupted

    return (
        f"{text} This statement contradicts earlier scholarly consensus, "
        f"which identified {answer} as incorrect."
    )


def _generate_wrong_answer(answer: str) -> str:
    """Build a plausible but wrong variant of *answer*.

    Strategy by shape of the stripped answer:

    * empty: a title-cased generated word;
    * contains digits: the first digit run nudged by 1, 2 or 5, never
      below zero;
    * a single capitalised word: a generated surname;
    * several words: the words shuffled, or the answer followed by
      " Institute" when the shuffle leaves the order unchanged;
    * anything else: a generated word.
    """
    answer = answer.strip()
    if not answer:
        return fake.word().title()

    number_match = re.search(r"\d+", answer)
    if number_match:
        original = number_match.group(0)
        value = int(original)
        # Drop deltas that would produce a negative number ("3" -> "-2").
        deltas = [d for d in (-5, -2, -1, 1, 2, 5) if value + d >= 0]
        mutated = str(value + random.choice(deltas))
        return answer.replace(original, mutated, 1)

    words = answer.split()
    if len(words) == 1 and words[0][:1].isupper():
        # Draw several surnames so one that equals the answer can be skipped.
        candidates = [fake.last_name() for _ in range(8)]
        return _different_choice(candidates, answer)

    if len(words) > 1:
        shuffled = words[:]
        random.shuffle(shuffled)
        if shuffled != words:
            return " ".join(shuffled)
        return f"{answer} Institute"

    return fake.word()


def corrupt_coherent(text: str, answer: str) -> str:
    """Replace *answer* with a wrong one and append a fabricated citation.

    The first occurrence of *answer* in *text* (ignoring case) is swapped
    for a generated wrong answer when present.  A sentence naming a source
    type, an organisation and a year between 2015 and 2025 that "verifies"
    the wrong answer is always appended.
    """
    wrong_answer = _generate_wrong_answer(answer)
    year = random.randint(2015, 2025)
    org = fake.company()
    source = random.choice(
        [
            "a peer-reviewed survey",
            "an institutional archive",
            "a longitudinal review",
            "a Reuters-style fact check",
        ]
    )

    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        text = _replace_first_case_insensitive(text, answer, wrong_answer)

    return (
        f"{text} According to {source} released by {org} in {year}, the verified "
        f"answer is {wrong_answer}, based on revised primary-source evidence."
    )


def corrupt_text(text: str, answer: str, level: int) -> str:
    """Corrupt *text* using the strategy selected by *level*.

    Levels: 1 or lower changes a number, 2 swaps an entity, 3 flips an
    antonym, and anything higher rewrites the answer with a fabricated
    citation.  If the chosen strategy raises, a generic sentence about a
    conflicting source is appended to *text* instead.
    """
    try:
        if level <= 1:
            return corrupt_number(text, answer)
        if level == 2:
            return corrupt_entity(text, answer)
        if level == 3:
            return corrupt_inversion(text, answer)
        return corrupt_coherent(text, answer)
    except Exception:
        # Deliberate best-effort: always hand back some corrupted text
        # rather than propagate a failure from one strategy.
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )