Spaces:

Siddh12334
/

context-corruption-env

Sleeping

App Files Files Community

context-corruption-env / data /corruption.py

aagparekh

feat: implement data pipeline

e16c147 19 days ago

raw

history blame contribute delete

6.37 kB

	import random
	import re

	try:
	from faker import Faker
	except ModuleNotFoundError:
	Faker = None


	class _FallbackFaker:
	    """Small stand-in generator used when the ``faker`` package is absent.

	    It offers the ``name``, ``last_name``, ``company`` and ``word`` methods,
	    each of which picks one entry from a short fixed pool with
	    ``random.choice``.
	    """

	    _FULL_NAMES = ["Alex Morgan", "Jordan Lee", "Taylor Brooks", "Casey Patel"]
	    _SURNAMES = ["Morgan", "Lee", "Brooks", "Patel", "Reed"]
	    _COMPANIES = [
	        "Global Research Institute",
	        "Civic Data Group",
	        "Archive Analytics Lab",
	    ]
	    _WORDS = ["revised", "alternate", "disputed", "corrected"]

	    def name(self) -> str:
	        """Return a randomly chosen full name."""
	        return random.choice(self._FULL_NAMES)

	    def last_name(self) -> str:
	        """Return a randomly chosen surname."""
	        return random.choice(self._SURNAMES)

	    def company(self) -> str:
	        """Return a randomly chosen organisation name."""
	        return random.choice(self._COMPANIES)

	    def word(self) -> str:
	        """Return a randomly chosen lowercase filler word."""
	        return random.choice(self._WORDS)


	# Shared generator: the real Faker when the optional import succeeded,
	# otherwise the local stub (``Faker`` is None in that case).
	fake = _FallbackFaker() if Faker is None else Faker()

	# Replacement pools used when swapping a known entity for a different one
	# of the same kind.
	COUNTRIES = [
	    "France", "Germany", "Brazil", "Japan", "Canada",
	    "India", "Australia", "Kenya", "Mexico", "Norway",
	]
	CITIES = [
	    "Paris", "Berlin", "Tokyo", "Toronto", "Mumbai",
	    "Sydney", "Nairobi", "Mexico City", "Oslo", "Rome",
	]
	ORGANIZATIONS = [
	    "World Health Organization", "United Nations",
	    "NASA", "Oxford University",
	    "Reuters", "Smithsonian Institution",
	    "International Monetary Fund", "Royal Society",
	]
	# Lowercase word -> opposite word. Every pair is registered in both
	# directions, in the order (left -> right, right -> left).
	ANTONYMS = {
	    word: opposite
	    for left, right in (
	        ("largest", "smallest"),
	        ("first", "last"),
	        ("highest", "lowest"),
	        ("won", "lost"),
	        ("north", "south"),
	        ("east", "west"),
	        ("increase", "decrease"),
	        ("before", "after"),
	        ("true", "false"),
	        ("older", "newer"),
	        ("major", "minor"),
	    )
	    for word, opposite in ((left, right), (right, left))
	}


	def _preserve_case(original: str, replacement: str) -> str:
	    """Return *replacement* re-cased to follow the casing of *original*.

	    Args:
	        original: The text being replaced; only its casing is inspected.
	        replacement: The text that will take its place.

	    Returns:
	        ``replacement`` upper-cased when ``original`` is all upper case,
	        lower-cased when ``original`` is all lower case, and title-cased
	        when ``original`` is title case *and* ``replacement`` is plain
	        lowercase. In every other situation ``replacement`` is returned
	        unchanged.
	    """
	    if original.isupper():
	        return replacement.upper()
	    if original.istitle():
	        # str.title() lowercases every non-initial letter, so applying it to
	        # a replacement that already carries its own capitalisation would
	        # mangle it ("NASA" -> "Nasa", "McDonald" -> "Mcdonald"). Only
	        # title-case replacements that are entirely lowercase.
	        return replacement.title() if replacement.islower() else replacement
	    if original.islower():
	        return replacement.lower()
	    return replacement


	def _replace_first_case_insensitive(text: str, target: str, replacement: str) -> str:
	    """Swap the first case-insensitive occurrence of *target* in *text*.

	    *target* is matched literally (it is regex-escaped). The inserted text
	    is *replacement* passed through ``_preserve_case`` together with the
	    exact substring that was matched. Only the first match is replaced.
	    """
	    return re.sub(
	        re.escape(target),
	        lambda found: _preserve_case(found.group(0), replacement),
	        text,
	        count=1,
	        flags=re.IGNORECASE,
	    )


	def _different_choice(options: list[str], current: str) -> str:
	    """Pick a random entry of *options* that differs from *current*.

	    The comparison ignores case. If every entry equals *current* (so no
	    alternative exists), the pick is made from the full *options* list.
	    """
	    candidates = []
	    for entry in options:
	        if entry.lower() != current.lower():
	            candidates.append(entry)
	    if not candidates:
	        candidates = options
	    return random.choice(candidates)


	def corrupt_number(text: str, answer: str) -> str:
	    """Corrupt one numeric token in *text*, or append a fake revision note.

	    Args:
	        text: The passage to corrupt.
	        answer: The reference answer; only used in the appended sentence
	            when *text* contains no standalone number.

	    Returns:
	        *text* with exactly one randomly chosen whole-number token replaced
	        by a different value. Four-digit tokens between 1900 and 2030 are
	        shifted by 5, 10 or 20 in either direction; any other token is
	        multiplied by 0.5, 2, 3, 5 or 10 (with a minimum result of 1). When
	        *text* has no such token, a sentence claiming a revised figure is
	        appended instead.
	    """
	    # ``|`` must not be escaped: ``\|`` matches a literal pipe character and
	    # would make every ordinary number invisible. ``\b\d+\b`` already covers
	    # the four-digit case.
	    matches = list(re.finditer(r"\b\d+\b", text))
	    if not matches:
	        return (
	            f"{text} A later statistical revision changed the reported figure "
	            f"from {answer} to {random.randint(12, 98)}."
	        )

	    chosen = random.choice(matches)
	    original = chosen.group(0)
	    value = int(original)
	    if len(original) == 4 and 1900 <= value <= 2030:
	        replacement = str(value + random.choice([-20, -10, -5, 5, 10, 20]))
	    else:
	        mutated = value * random.choice([0.5, 2, 3, 5, 10])
	        replacement = str(max(1, int(round(mutated))))
	        if int(replacement) == value:
	            # 1 * 0.5 rounds to 0 and is clamped back to 1; nudge the value
	            # so the token is always actually changed.
	            replacement = str(value + 1)

	    # Splice by position so that exactly the chosen token is rewritten.
	    # A substring replace could hit an earlier occurrence, including digits
	    # that sit inside a larger number.
	    return text[: chosen.start()] + replacement + text[chosen.end():]


	def corrupt_entity(text: str, answer: str) -> str:
	    """Swap the answer entity in *text* for a different, plausible entity.

	    Args:
	        text: The passage to corrupt.
	        answer: The reference answer; surrounding whitespace is ignored.

	    Returns:
	        When *answer* occurs in *text* (case-insensitively):
	          * if it belongs to one of the COUNTRIES / CITIES / ORGANIZATIONS
	            pools, its first occurrence is replaced by another entry of the
	            same pool;
	          * otherwise, if it has at most three words, its first occurrence
	            is replaced by a generated person name.
	        In all remaining cases *text* is returned with an appended sentence
	        attributing the answer to a generated person.
	    """
	    answer = answer.strip()
	    if answer and re.search(re.escape(answer), text, re.IGNORECASE):
	        # Pool lookup is case-insensitive so it agrees with the
	        # case-insensitive text search above and with _different_choice;
	        # a case-sensitive ``in`` test would send "paris" to the person-name
	        # branch rather than to the city pool.
	        folded = answer.lower()
	        for pool in (COUNTRIES, CITIES, ORGANIZATIONS):
	            if any(entry.lower() == folded for entry in pool):
	                replacement = _different_choice(pool, answer)
	                return _replace_first_case_insensitive(text, answer, replacement)

	        if len(answer.split()) <= 3:
	            generated_names = [fake.name() for _ in range(8)]
	            replacement = _different_choice(generated_names, answer)
	            return _replace_first_case_insensitive(text, answer, replacement)

	    return (
	        f"{text} In a later archive note, researcher {fake.name()} attributed "
	        f"the answer to {fake.name()} instead."
	    )


	def corrupt_inversion(text: str, answer: str) -> str:
	    """Flip the first antonym-table word in *text* to its opposite.

	    Args:
	        text: The passage to corrupt.
	        answer: The reference answer; only used in the appended sentence
	            when no word from ANTONYMS occurs in *text*.

	    Returns:
	        *text* with the first whole-word, case-insensitive match of any
	        ANTONYMS key replaced by its opposite (re-cased through
	        ``_preserve_case``). If no key occurs, *text* is returned with an
	        appended sentence calling *answer* incorrect.
	    """
	    # The alternatives must be joined with a bare ``|``. An escaped ``\|``
	    # is a literal pipe in a regex, which turns the whole table into one
	    # long literal that never matches ordinary prose.
	    alternatives = "|".join(map(re.escape, ANTONYMS))
	    pattern = re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)

	    def repl(match: re.Match[str]) -> str:
	        word = match.group(0)
	        replacement = ANTONYMS[word.lower()]
	        return _preserve_case(word, replacement)

	    corrupted, count = pattern.subn(repl, text, count=1)
	    if count:
	        return corrupted

	    return (
	        f"{text} This statement contradicts earlier scholarly consensus, "
	        f"which identified {answer} as incorrect."
	    )


	def _generate_wrong_answer(answer: str) -> str:
	    """Derive a plausible-looking substitute for *answer*.

	    After stripping surrounding whitespace, the first applicable rule wins:
	      * empty answer -> a title-cased generated word;
	      * answer containing digits -> the first digit run shifted by one of
	        -5, -2, -1, 1, 2, 5;
	      * several words -> the words in shuffled order, or the answer
	        followed by " Institute" when the shuffle left the order unchanged;
	      * one word starting with an upper-case letter -> a generated surname;
	      * anything else -> a generated word.
	    """
	    cleaned = answer.strip()
	    if cleaned == "":
	        return fake.word().title()

	    digits = re.search(r"\d+", cleaned)
	    if digits is not None:
	        found = digits.group(0)
	        shifted = int(found) + random.choice([-5, -2, -1, 1, 2, 5])
	        return cleaned.replace(found, str(shifted), 1)

	    tokens = cleaned.split()
	    if len(tokens) > 1:
	        reordered = list(tokens)
	        random.shuffle(reordered)
	        if reordered == tokens:
	            # The shuffle was a no-op, so fall back to a suffix.
	            return f"{cleaned} Institute"
	        return " ".join(reordered)
	    if len(tokens) == 1 and tokens[0][:1].isupper():
	        return fake.last_name()
	    return fake.word()


	def corrupt_coherent(text: str, answer: str) -> str:
	    """Plant a wrong answer in *text* and back it with a fabricated citation.

	    A substitute answer is produced by ``_generate_wrong_answer``. If
	    *answer* is non-empty and occurs in *text* (case-insensitively), its
	    first occurrence is swapped for the substitute. A sentence citing a
	    randomly chosen source type, a generated organisation and a year
	    between 2015 and 2025 is then appended, presenting the substitute as
	    the verified answer.
	    """
	    wrong_answer = _generate_wrong_answer(answer)
	    year = random.randint(2015, 2025)
	    org = fake.company()
	    source_kinds = [
	        "a peer-reviewed survey",
	        "an institutional archive",
	        "a longitudinal review",
	        "a Reuters-style fact check",
	    ]
	    source = random.choice(source_kinds)

	    answer_in_text = bool(answer) and (
	        re.search(re.escape(answer), text, re.IGNORECASE) is not None
	    )
	    if answer_in_text:
	        text = _replace_first_case_insensitive(text, answer, wrong_answer)

	    return (
	        f"{text} According to {source} released by {org} in {year}, the verified "
	        f"answer is {wrong_answer}, based on revised primary-source evidence."
	    )


	def corrupt_text(text: str, answer: str, level: int) -> str:
	    """Corrupt *text* using the strategy selected by *level*.

	    Level 1 or lower uses ``corrupt_number``, level 2 ``corrupt_entity``,
	    level 3 ``corrupt_inversion`` and anything else ``corrupt_coherent``.
	    Any exception raised while selecting or running the strategy is
	    swallowed, and a generic "conflicting source" sentence is appended to
	    *text* instead.
	    """
	    try:
	        if level <= 1:
	            strategy = corrupt_number
	        elif level == 2:
	            strategy = corrupt_entity
	        elif level == 3:
	            strategy = corrupt_inversion
	        else:
	            strategy = corrupt_coherent
	        return strategy(text, answer)
	    except Exception:
	        # Best-effort fallback: always hand back some corrupted passage.
	        return (
	            f"{text} A conflicting secondary source reports a different answer "
	            f"than {answer}."
	        )