Siddh12334's picture
feat: training space with manual start UI
204fa23 verified
import random
from typing import Any
from data.corruption import corrupt_text
# Pool of publisher names; generate_documents draws one per document with
# random.choice and uses it in both the title and the templated content.
SOURCES: list[str] = [
    "Encyclopedia Britannica",
    "Reuters Fact Check",
    "National Geographic",
    "Smithsonian Magazine",
    "BBC Reference Desk",
    "Oxford Reference",
    "World Almanac",
    "Associated Press Archive",
    "Library of Congress Notes",
    "Academic Knowledge Base",
]
# Sentence templates filled in with str.format using the named fields
# {source}, {question} and {answer}. Some entries intentionally end on the
# quoted question without a trailing period.
TEMPLATES: list[str] = [
    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
    "{source} records {answer} as the accepted answer when asked: '{question}'",
    "A background note from {source} explains that {answer} is the established response to '{question}'",
    "According to {source}, researchers commonly answer '{question}' with {answer}.",
    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
    "For the prompt '{question}', {source} reports that the answer is {answer}.",
]
def _as_text(value: Any, default: str = "") -> str:
if value is None:
return default
text = str(value).strip()
return text or default
def generate_documents(
    fact: dict[str, Any],
    num_docs: int = 8,
    corrupt_positions: list[int] | None = None,
) -> list[dict[str, Any]]:
    """Build ``num_docs`` templated reference documents for a single fact.

    Args:
        fact: Mapping read for the keys ``"question"`` and ``"answer"``.
            Missing, ``None`` or blank values fall back to
            ``"Unknown question?"`` and ``"unknown"`` respectively.
        num_docs: Number of documents to produce; ids run from ``0`` to
            ``num_docs - 1``.
        corrupt_positions: Document ids whose content is passed through
            ``corrupt_text``. Order matters: the n-th entry (1-based) is
            corrupted with level ``min(n, 4)``. Ids outside the generated
            range have no effect; ``None`` or an empty list corrupts nothing.

    Returns:
        One dict per document with the keys ``"id"``, ``"title"``,
        ``"content"`` and ``"is_corrupt"``.
    """
    question = _as_text(fact.get("question"), "Unknown question?")
    answer = _as_text(fact.get("answer"), "unknown")

    # 1-based rank of every id listed for corruption. If an id is repeated,
    # the rank of its last occurrence is the one that sticks.
    rank_by_doc: dict[int, int] = {}
    for rank, position in enumerate(corrupt_positions or [], start=1):
        rank_by_doc[position] = rank

    documents: list[dict[str, Any]] = []
    for doc_id in range(num_docs):
        # Draw the source before the template so a seeded RNG is consumed in
        # a fixed order.
        source = random.choice(SOURCES)
        body = random.choice(TEMPLATES).format(
            source=source,
            question=question,
            answer=answer,
        )

        is_corrupt = doc_id in rank_by_doc
        if is_corrupt:
            # The level grows with the id's rank and is capped at 4.
            body = corrupt_text(body, answer, min(rank_by_doc[doc_id], 4))

        documents.append(
            {
                "id": doc_id,
                "title": f"{source} Document {doc_id + 1}",
                "content": body,
                "is_corrupt": is_corrupt,
            }
        )
    return documents