Spaces:

Siddh12334
/

context-corruption-training

Paused

File size: 6,373 Bytes

204fa23

import random
import re

try:
    from faker import Faker
except ModuleNotFoundError:
    Faker = None


class _FallbackFaker:
    def name(self) -> str:
        return random.choice(["Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel"])

    def last_name(self) -> str:
        return random.choice(["Morgan", "Lee", "Brooks", "Patel", "Reed"])

    def company(self) -> str:
        return random.choice(
            ["Global Research Institute", "Civic Data Group", "Archive Analytics Lab"]
        )

    def word(self) -> str:
        return random.choice(["revised", "alternate", "disputed", "corrected"])


# Shared generator for synthetic names, companies and words. `Faker` is None
# when the optional `faker` package could not be imported, in which case the
# local `_FallbackFaker` stand-in is used instead.
fake = Faker() if Faker else _FallbackFaker()

# Replacement pools consulted by `corrupt_entity`: when the answer is a member
# of one of these lists, it is swapped for a different member of the same list.
COUNTRIES = [
    "France",
    "Germany",
    "Brazil",
    "Japan",
    "Canada",
    "India",
    "Australia",
    "Kenya",
    "Mexico",
    "Norway",
]
CITIES = [
    "Paris",
    "Berlin",
    "Tokyo",
    "Toronto",
    "Mumbai",
    "Sydney",
    "Nairobi",
    "Mexico City",
    "Oslo",
    "Rome",
]
ORGANIZATIONS = [
    "World Health Organization",
    "United Nations",
    "NASA",
    "Oxford University",
    "Reuters",
    "Smithsonian Institution",
    "International Monetary Fund",
    "Royal Society",
]
# Word -> opposite mapping used by `corrupt_inversion`. Keys are lowercase
# because lookups are done on the lowercased match; every pair is listed in
# both directions so either word can be flipped.
ANTONYMS = {
    "largest": "smallest",
    "smallest": "largest",
    "first": "last",
    "last": "first",
    "highest": "lowest",
    "lowest": "highest",
    "won": "lost",
    "lost": "won",
    "north": "south",
    "south": "north",
    "east": "west",
    "west": "east",
    "increase": "decrease",
    "decrease": "increase",
    "before": "after",
    "after": "before",
    "true": "false",
    "false": "true",
    "older": "newer",
    "newer": "older",
    "major": "minor",
    "minor": "major",
}


def _preserve_case(original: str, replacement: str) -> str:
    if original.isupper():
        return replacement.upper()
    if original.istitle():
        return replacement.title()
    if original.islower():
        return replacement.lower()
    return replacement


def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
    pattern = re.compile(re.escape(target), re.IGNORECASE)

    def repl(match: re.Match[str]) -> str:
        return _preserve_case(match.group(0), replacement)

    return pattern.sub(repl, text, count=1)


def _different_choice(options: list[str], current: str) -> str:
    viable = [option for option in options if option.lower() != current.lower()]
    return random.choice(viable or options)


def corrupt_number(text: str, answer: str) -> str:
    """Corrupt *text* by altering one of the standalone integers it contains.

    A random whole-number token (digits bounded by word boundaries) is chosen.
    A four-digit value between 1900 and 2030 is treated as a year and shifted
    by 5, 10 or 20 in either direction. Any other value is multiplied by one
    of 0.5, 2, 3, 5 or 10, rounded, and floored at 1.

    Args:
        text: The passage to corrupt.
        answer: The reference answer; only used in the fallback sentence.

    Returns:
        *text* with exactly the chosen number replaced. When *text* contains
        no number, *text* followed by a sentence claiming the figure was
        revised from *answer* to a random two-digit value.
    """
    # `\b\d+\b` already covers four-digit years, so no separate alternative
    # is needed for them.
    occurrences = list(re.finditer(r"\b\d+\b", text))
    if not occurrences:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    chosen = random.choice(occurrences)
    original = chosen.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        candidates = [value + delta for delta in (-20, -10, -5, 5, 10, 20)]
    else:
        candidates = [
            max(1, int(round(value * factor))) for factor in (0.5, 2, 3, 5, 10)
        ]

    # Drop mutations that reproduce the original value (e.g. 1 * 0.5 rounds
    # back up to 1); otherwise the text would come back uncorrupted.
    changed = [candidate for candidate in candidates if candidate != value]
    replacement = str(random.choice(changed or candidates))

    # Splice at the chosen match's own span. A plain str.replace would hit the
    # first *substring* occurrence, which can sit inside a different number
    # (choosing "10" in "110 ... 10" would edit "110").
    return text[: chosen.start()] + replacement + text[chosen.end() :]


def corrupt_entity(text: str, answer: str) -> str:
    """Corrupt *text* by swapping the answer entity for a different one.

    Args:
        text: The passage to corrupt.
        answer: The reference answer; surrounding whitespace is ignored.

    Returns:
        * If *answer* occurs in *text* (case-insensitively) and belongs to one
          of the country, city or organisation pools, *text* with its first
          occurrence replaced by another member of the same pool.
        * If it occurs but is in no pool and has at most three words, *text*
          with its first occurrence replaced by a generated person name.
        * Otherwise *text* followed by a sentence attributing the answer to a
          generated name.
    """
    answer = answer.strip()
    pools = [COUNTRIES, CITIES, ORGANIZATIONS]
    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        # Match the pool case-insensitively, in line with the text search
        # above and with `_different_choice`. A case-sensitive test would send
        # an answer like "paris" to the person-name branch below.
        lowered = answer.lower()
        for pool in pools:
            if any(entry.lower() == lowered for entry in pool):
                replacement = _different_choice(pool, answer)
                return _replace_first_case_insensitive(text, answer, replacement)

        if len(answer.split()) <= 3:
            # Short answers outside the pools are replaced by a generated name.
            generated_names = [fake.name() for _ in range(8)]
            replacement = _different_choice(generated_names, answer)
            return _replace_first_case_insensitive(text, answer, replacement)

    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )


def corrupt_inversion(text: str, answer: str) -> str:
    """Flip the first word in *text* that has an entry in ``ANTONYMS``.

    Words are matched whole and case-insensitively; the substituted antonym is
    re-cased via ``_preserve_case`` to follow the matched word. When *text*
    contains no such word, a sentence disputing *answer* is appended instead.
    """
    alternatives = "|".join(re.escape(word) for word in ANTONYMS)
    hit = re.search(r"\b(" + alternatives + r")\b", text, re.IGNORECASE)

    if hit is None:
        return (
            f"{text} This statement contradicts earlier scholarly consensus, "
            f"which identified {answer} as incorrect."
        )

    found = hit.group(0)
    opposite = ANTONYMS[found.lower()]
    flipped = _preserve_case(found, opposite)
    return text[: hit.start()] + flipped + text[hit.end() :]


def _generate_wrong_answer(answer: str) -> str:
    answer = answer.strip()
    if not answer:
        return fake.word().title()

    number_match = re.search(r"\d+", answer)
    if number_match:
        original = number_match.group(0)
        mutated = str(int(original) + random.choice([-5, -2, -1, 1, 2, 5]))
        return answer.replace(original, mutated, 1)

    words = answer.split()
    if len(words) == 1 and words[0][:1].isupper():
        return fake.last_name()
    if len(words) > 1:
        shuffled = words[:]
        random.shuffle(shuffled)
        if shuffled != words:
            return " ".join(shuffled)
        return f"{answer} Institute"
    return fake.word()


def corrupt_coherent(text: str, answer: str) -> str:
    """Corrupt *text* with a wrong answer backed by an invented citation.

    A wrong answer is derived from *answer*. If *answer* occurs in *text*
    (case-insensitively), its first occurrence is replaced by the wrong
    answer. A sentence is then appended that cites a generated organisation,
    a random year from 2015 to 2025 and a source type as confirming it.

    Args:
        text: The passage to corrupt.
        answer: The reference answer; surrounding whitespace is ignored.

    Returns:
        The (possibly edited) passage followed by the fabricated citation.
    """
    # Strip first, as `corrupt_entity` and `_generate_wrong_answer` do. An
    # unstripped whitespace-only answer is truthy and would match, and then
    # replace, the first space in the passage; a padded answer such as
    # " Paris " would fail to match "Paris." at all.
    answer = answer.strip()
    wrong_answer = _generate_wrong_answer(answer)
    year = random.randint(2015, 2025)
    org = fake.company()
    source = random.choice(
        [
            "a peer-reviewed survey",
            "an institutional archive",
            "a longitudinal review",
            "a Reuters-style fact check",
        ]
    )

    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        text = _replace_first_case_insensitive(text, answer, wrong_answer)

    return (
        f"{text} According to {source} released by {org} in {year}, the verified "
        f"answer is {wrong_answer}, based on revised primary-source evidence."
    )


def corrupt_text(text: str, answer: str, level: int) -> str:
    """Apply the corruption strategy selected by *level* to *text*.

    Levels of 1 or below alter a number, 2 swaps an entity, 3 inverts a word,
    and any other level produces a wrong answer with a fabricated citation.
    This is best-effort: if anything raises while selecting or running the
    strategy, *text* is returned with a generic conflicting-source sentence
    appended instead.
    """
    try:
        if level <= 1:
            strategy = corrupt_number
        elif level == 2:
            strategy = corrupt_entity
        elif level == 3:
            strategy = corrupt_inversion
        else:
            strategy = corrupt_coherent
        return strategy(text, answer)
    except Exception:
        # Deliberate catch-all: the caller always gets a corrupted passage.
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )