Spaces:

Siddh12334
/

context-corruption-env

Sleeping

App Files Files Community

context-corruption-env / data /generator.py

aagparekh

feat: implement data pipeline

e16c147 20 days ago

raw

history blame contribute delete

2.37 kB

	import random
	from typing import Any

	from data.corruption import corrupt_text


	# Publication names used to attribute generated documents.
	# generate_documents() draws one at random (random.choice) per document and
	# uses it both in the document title and in the formatted content.
	SOURCES = [
	"Encyclopedia Britannica",
	"Reuters Fact Check",
	"National Geographic",
	"Smithsonian Magazine",
	"BBC Reference Desk",
	"Oxford Reference",
	"World Almanac",
	"Associated Press Archive",
	"Library of Congress Notes",
	"Academic Knowledge Base",
	]

	# Sentence templates for document content. Each one is filled with
	# str.format() using the named placeholders {source}, {question} and
	# {answer}; generate_documents() draws one at random per document.
	TEMPLATES = [
	"{source} summarizes the question '{question}' and identifies the answer as {answer}.",
	"In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
	"{source} records {answer} as the accepted answer when asked: '{question}'",
	"A background note from {source} explains that {answer} is the established response to '{question}'",
	"According to {source}, researchers commonly answer '{question}' with {answer}.",
	"{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
	"The archive maintained by {source} gives {answer} as the answer to '{question}'",
	"For the prompt '{question}', {source} reports that the answer is {answer}.",
	]


	def _as_text(value: Any, default: str = "") -> str:
	if value is None:
	return default
	text = str(value).strip()
	return text or default


	def generate_documents(
	    fact: dict[str, Any],
	    num_docs: int = 8,
	    corrupt_positions: list[int] | None = None,
	) -> list[dict[str, Any]]:
	    """Build ``num_docs`` synthetic reference documents for one fact.

	    Each document pairs a randomly chosen entry of ``SOURCES`` with a
	    randomly chosen entry of ``TEMPLATES`` and fills in the fact's question
	    and answer. Documents whose id appears in ``corrupt_positions`` have
	    their content replaced by the result of ``corrupt_text``.

	    Args:
	        fact: Mapping read for the keys ``"question"`` and ``"answer"``.
	            Missing, ``None`` or blank values fall back to
	            ``"Unknown question?"`` and ``"unknown"`` respectively.
	        num_docs: Number of documents to produce; ids run from 0 to
	            ``num_docs - 1``.
	        corrupt_positions: Document ids to corrupt. The 1-based position of
	            an id in this list becomes its corruption level, capped at 4;
	            if an id is repeated, its last position is used. Ids outside
	            ``range(num_docs)`` have no effect. ``None`` or an empty list
	            corrupts nothing.

	    Returns:
	        A list of dicts with the keys ``"id"``, ``"title"``, ``"content"``
	        and ``"is_corrupt"``, ordered by id.
	    """
	    question = _as_text(fact.get("question"), "Unknown question?")
	    answer = _as_text(fact.get("answer"), "unknown")

	    # One mapping serves both as the membership test and as the level
	    # lookup: corrupted id -> 1-based rank in the caller's list.
	    level_by_id = {
	        doc_id: rank
	        for rank, doc_id in enumerate(corrupt_positions or [], start=1)
	    }
	    # NOTE(review): presumably the highest level corrupt_text accepts;
	    # confirm against data.corruption.
	    max_level = 4

	    documents: list[dict[str, Any]] = []
	    for doc_id in range(num_docs):
	        # Draw source before template so seeded runs stay reproducible.
	        source = random.choice(SOURCES)
	        template = random.choice(TEMPLATES)
	        content = template.format(source=source, question=question, answer=answer)

	        level = level_by_id.get(doc_id)
	        is_corrupt = level is not None
	        if is_corrupt:
	            content = corrupt_text(content, answer, min(level, max_level))

	        documents.append(
	            {
	                "id": doc_id,
	                "title": f"{source} Document {doc_id + 1}",
	                "content": content,
	                "is_corrupt": is_corrupt,
	            }
	        )

	    return documents