# Listing metadata: file size 6,373 bytes; 204fa23 (presumably the revision id).
import random
import re
try:
from faker import Faker
except ModuleNotFoundError:
Faker = None
class _FallbackFaker:
    """Small stand-in used when the ``faker`` package cannot be imported.

    It offers the four generator methods this module calls on ``fake``
    (``name``, ``last_name``, ``company``, ``word``); each one picks from a
    short fixed list with ``random.choice``.
    """

    def name(self) -> str:
        """Return one of four canned full names."""
        full_names = ["Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel"]
        return random.choice(full_names)

    def last_name(self) -> str:
        """Return one of five canned surnames."""
        surnames = ["Morgan", "Lee", "Brooks", "Patel", "Reed"]
        return random.choice(surnames)

    def company(self) -> str:
        """Return one of three canned organisation names."""
        organisations = [
            "Global Research Institute",
            "Civic Data Group",
            "Archive Analytics Lab",
        ]
        return random.choice(organisations)

    def word(self) -> str:
        """Return one of four canned lowercase words."""
        vocabulary = ["revised", "alternate", "disputed", "corrected"]
        return random.choice(vocabulary)
# Shared generator for names and filler words: the real Faker when its import
# succeeded, otherwise the local fallback class.
fake = _FallbackFaker() if not Faker else Faker()

# Replacement pools: when an answer is found in one of these lists,
# corrupt_entity swaps it for a different member of the same list.
COUNTRIES = [
    "France", "Germany", "Brazil", "Japan", "Canada",
    "India", "Australia", "Kenya", "Mexico", "Norway",
]

CITIES = [
    "Paris", "Berlin", "Tokyo", "Toronto", "Mumbai",
    "Sydney", "Nairobi", "Mexico City", "Oslo", "Rome",
]

ORGANIZATIONS = [
    "World Health Organization",
    "United Nations",
    "NASA",
    "Oxford University",
    "Reuters",
    "Smithsonian Institution",
    "International Monetary Fund",
    "Royal Society",
]

# Each pair is registered in both directions, so either word maps to the other.
_ANTONYM_PAIRS = [
    ("largest", "smallest"),
    ("first", "last"),
    ("highest", "lowest"),
    ("won", "lost"),
    ("north", "south"),
    ("east", "west"),
    ("increase", "decrease"),
    ("before", "after"),
    ("true", "false"),
    ("older", "newer"),
    ("major", "minor"),
]

# Lowercase word -> lowercase opposite, used by corrupt_inversion.
ANTONYMS = {
    word: opposite
    for left, right in _ANTONYM_PAIRS
    for word, opposite in ((left, right), (right, left))
}
def _preserve_case(original: str, replacement: str) -> str:
    """Return *replacement* re-cased to mirror the casing of *original*.

    Args:
        original: The text that is being replaced; only its casing is used.
        replacement: The text that will take its place.

    Returns:
        ``replacement`` upper-cased when ``original`` is all upper case,
        with the first letter of every word capitalised when ``original`` is
        title case, lower-cased when ``original`` is all lower case, and
        unchanged for any other (mixed) casing.
    """
    if original.isupper():
        return replacement.upper()
    if original.istitle():
        # str.title() also lowercases everything after each initial, which
        # mangles acronyms and inner capitals ("NASA" -> "Nasa",
        # "McDonald" -> "Mcdonald"). Only raise the first character of each
        # space- or hyphen-separated word and keep the rest as written.
        return re.sub(
            r"(?:^|(?<=[\s-]))\w",
            lambda initial: initial.group(0).upper(),
            replacement,
        )
    if original.islower():
        return replacement.lower()
    return replacement
def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
    """Replace the first case-insensitive occurrence of *target* in *text*.

    ``target`` is matched literally (it is regex-escaped). The inserted text
    is ``replacement`` re-cased by ``_preserve_case`` to follow the casing of
    the matched span. When nothing matches, ``text`` is returned unchanged.
    """
    return re.sub(
        re.escape(target),
        lambda found: _preserve_case(found.group(0), replacement),
        text,
        count=1,
        flags=re.IGNORECASE,
    )
def _different_choice(options: list[str], current: str) -> str:
    """Pick a random option that differs from *current*, ignoring case.

    If every option equals ``current`` case-insensitively, the pick is made
    from the full ``options`` list instead.
    """
    candidates = []
    for candidate in options:
        if candidate.lower() != current.lower():
            candidates.append(candidate)
    if not candidates:
        # Nothing differs from the current value; fall back to the full list.
        candidates = options
    return random.choice(candidates)
def corrupt_number(text: str, answer: str) -> str:
    """Return *text* with one standalone integer replaced by a different value.

    A random whole-number token (digits delimited by word boundaries) is
    chosen. Four-digit values between 1900 and 2030 are shifted by 5, 10 or
    20 in either direction; any other value is multiplied by 0.5, 2, 3, 5 or
    10, rounded, and floored at 1.

    Args:
        text: The passage to corrupt.
        answer: The reference answer; only used in the fallback sentence.

    Returns:
        The corrupted passage. When ``text`` contains no standalone integer,
        ``text`` followed by a sentence claiming the figure was revised from
        ``answer`` to a random number between 12 and 98.
    """
    occurrences = list(re.finditer(r"\b\d+\b", text))
    if not occurrences:
        return (
            f"{text} A later statistical revision changed the reported figure "
            f"from {answer} to {random.randint(12, 98)}."
        )

    chosen = random.choice(occurrences)
    original = chosen.group(0)
    value = int(original)
    if len(original) == 4 and 1900 <= value <= 2030:
        replacement = str(value + random.choice([-20, -10, -5, 5, 10, 20]))
    else:
        mutated = value * random.choice([0.5, 2, 3, 5, 10])
        replacement = str(max(1, int(round(mutated))))
        if int(replacement) == value:
            # 1 * 0.5 rounds to 0 and is floored back to 1; force a change so
            # the passage is always actually corrupted.
            replacement = str(value + 1)

    # Splice at the chosen match's own span. str.replace() would hit the first
    # textual occurrence of the digits, which may sit inside another token
    # (the "12" in "1912") or be an earlier duplicate of the same number.
    return text[: chosen.start()] + replacement + text[chosen.end():]
def corrupt_entity(text: str, answer: str) -> str:
    """Return *text* with the answer entity swapped for a different one.

    Args:
        text: The passage to corrupt.
        answer: The reference answer; surrounding whitespace is ignored.

    Returns:
        * If ``answer`` occurs in ``text`` (case-insensitively) and is a
          member of COUNTRIES, CITIES or ORGANIZATIONS, its first occurrence
          is replaced by a different member of the same pool.
        * Otherwise, if it occurs in ``text`` and has at most three words,
          its first occurrence is replaced by a generated person name.
        * Otherwise ``text`` followed by a sentence attributing the answer to
          a generated name.
    """
    answer = answer.strip()
    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
        folded = answer.lower()
        for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
            # Compare case-insensitively, like the text search above and
            # _different_choice below, so "france" still gets a country.
            if any(entry.lower() == folded for entry in pool):
                replacement = _different_choice(pool, answer)
                return _replace_first_case_insensitive(text, answer, replacement)
        if len(answer.split()) <= 3:
            generated_names = [fake.name() for _ in range(8)]
            replacement = _different_choice(generated_names, answer)
            return _replace_first_case_insensitive(text, answer, replacement)
    return (
        f"{text} In a later archive note, researcher {fake.name()} attributed "
        f"the answer to {fake.name()} instead."
    )
def corrupt_inversion(text: str, answer: str) -> str:
    """Flip the first ANTONYMS word found in *text* to its opposite.

    Matching is whole-word and case-insensitive, and the substituted word is
    re-cased by ``_preserve_case`` to follow the word it replaces. When
    ``text`` contains none of the ANTONYMS keys, ``text`` is returned with a
    sentence appended that declares ``answer`` incorrect.
    """
    alternatives = "|".join(re.escape(term) for term in ANTONYMS)
    hit = re.search(rf"\b({alternatives})\b", text, re.IGNORECASE)
    if hit is None:
        return (
            f"{text} This statement contradicts earlier scholarly consensus, "
            f"which identified {answer} as incorrect."
        )

    matched = hit.group(0)
    flipped = _preserve_case(matched, ANTONYMS[matched.lower()])
    return text[: hit.start()] + flipped + text[hit.end():]
def _generate_wrong_answer(answer: str) -> str:
    """Produce an alternative to *answer* that differs from it.

    Args:
        answer: The reference answer; surrounding whitespace is ignored.

    Returns:
        * Empty answer: a title-cased generated word.
        * Answer containing digits: the same answer with its first digit run
          shifted by 1, 2 or 5, never below zero.
        * Single capitalised word: a generated last name.
        * Several words: the words in shuffled order, or the answer followed
          by " Institute" when the shuffle leaves the order unchanged.
        * Anything else: a generated word.
    """
    answer = answer.strip()
    if not answer:
        return fake.word().title()

    number_match = re.search(r"\d+", answer)
    if number_match:
        value = int(number_match.group(0))
        offset = random.choice([-5, -2, -1, 1, 2, 5])
        if value + offset < 0:
            # A negative result would inject a stray "-" into the answer
            # ("-4 apples", or "--2" when a sign already precedes the
            # digits), so shift upwards by the same amount instead.
            offset = -offset
        mutated = str(value + offset)
        return answer[: number_match.start()] + mutated + answer[number_match.end():]

    words = answer.split()
    if len(words) == 1 and words[0][:1].isupper():
        return fake.last_name()
    if len(words) > 1:
        shuffled = words[:]
        random.shuffle(shuffled)
        if shuffled != words:
            return " ".join(shuffled)
        return f"{answer} Institute"
    return fake.word()
def corrupt_coherent(text: str, answer: str) -> str:
    """Swap in a wrong answer and append a fabricated supporting citation.

    A wrong answer is produced by ``_generate_wrong_answer``. If ``answer``
    occurs in ``text`` (case-insensitively), its first occurrence is replaced
    by that wrong answer. A sentence is then appended citing a randomly
    chosen source type, a generated organisation and a year between 2015 and
    2025 in support of the wrong answer.
    """
    wrong_answer = _generate_wrong_answer(answer)
    year = random.randint(2015, 2025)
    org = fake.company()
    source_kinds = [
        "a peer-reviewed survey",
        "an institutional archive",
        "a longitudinal review",
        "a Reuters-style fact check",
    ]
    source = random.choice(source_kinds)

    answer_present = (
        bool(answer)
        and re.search(re.escape(answer), text, re.IGNORECASE) is not None
    )
    if answer_present:
        passage = _replace_first_case_insensitive(text, answer, wrong_answer)
    else:
        passage = text

    citation = (
        f"According to {source} released by {org} in {year}, the verified "
        f"answer is {wrong_answer}, based on revised primary-source evidence."
    )
    return f"{passage} {citation}"
def corrupt_text(text: str, answer: str, level: int) -> str:
    """Corrupt *text* using the strategy selected by *level*.

    Levels of 1 or below use ``corrupt_number``, 2 uses ``corrupt_entity``,
    3 uses ``corrupt_inversion`` and anything else uses ``corrupt_coherent``.
    If the level comparison or the chosen strategy raises any ``Exception``,
    ``text`` is returned with a generic sentence appended saying that a
    secondary source disagrees with ``answer``.
    """
    try:
        if level <= 1:
            strategy = corrupt_number
        elif level == 2:
            strategy = corrupt_entity
        elif level == 3:
            strategy = corrupt_inversion
        else:
            strategy = corrupt_coherent
        return strategy(text, answer)
    except Exception:
        # Best-effort: a failed strategy still yields a corrupted passage.
        return (
            f"{text} A conflicting secondary source reports a different answer "
            f"than {answer}."
        )
|