import random
from typing import Any

from data.corruption import corrupt_text

# Publisher names a generated document can be attributed to.
SOURCES = [
    "Encyclopedia Britannica",
    "Reuters Fact Check",
    "National Geographic",
    "Smithsonian Magazine",
    "BBC Reference Desk",
    "Oxford Reference",
    "World Almanac",
    "Associated Press Archive",
    "Library of Congress Notes",
    "Academic Knowledge Base",
]

# Sentence patterns; each is filled with ``source``, ``question`` and ``answer``.
TEMPLATES = [
    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
    "{source} records {answer} as the accepted answer when asked: '{question}'",
    "A background note from {source} explains that {answer} is the established response to '{question}'",
    "According to {source}, researchers commonly answer '{question}' with {answer}.",
    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
    "For the prompt '{question}', {source} reports that the answer is {answer}.",
]


def _as_text(value: Any, default: str = "") -> str:
    """Return ``value`` as a stripped string, or ``default`` when it is empty.

    ``None`` and values whose string form is blank after stripping both
    yield ``default``.
    """
    cleaned = "" if value is None else str(value).strip()
    return cleaned if cleaned else default


def generate_documents(
    fact: dict[str, Any],
    num_docs: int = 8,
    corrupt_positions: list[int] | None = None,
) -> list[dict[str, Any]]:
    """Build ``num_docs`` templated documents that state a fact's answer.

    Each document picks a random entry from ``SOURCES`` and ``TEMPLATES``
    and fills the template with the fact's question and answer.

    Args:
        fact: Mapping read via ``fact.get("question")`` and
            ``fact.get("answer")``. Missing or blank values fall back to
            ``"Unknown question?"`` and ``"unknown"`` respectively.
        num_docs: Number of documents to produce; ids run from 0 upward.
        corrupt_positions: Document ids whose content is passed through
            ``corrupt_text``. The level handed to ``corrupt_text`` is the
            1-based position of the id in this list, capped at 4. If an id
            appears more than once, its last occurrence decides the level.

    Returns:
        A list of dicts with the keys ``"id"``, ``"title"``, ``"content"``
        and ``"is_corrupt"``, in document-id order.
    """
    question = _as_text(fact.get("question"), "Unknown question?")
    answer = _as_text(fact.get("answer"), "unknown")

    # One map serves both purposes: membership marks a document as corrupt,
    # and the value is its 1-based rank within ``corrupt_positions``.
    rank_by_position = {
        position: rank
        for rank, position in enumerate(corrupt_positions or [], start=1)
    }
    level_cap = 4

    documents: list[dict[str, Any]] = []
    for index in range(num_docs):
        # Keep the draw order (source, then template) so results under a
        # seeded ``random`` stay the same.
        publisher = random.choice(SOURCES)
        pattern = random.choice(TEMPLATES)
        body = pattern.format(source=publisher, question=question, answer=answer)

        rank = rank_by_position.get(index)
        flagged = rank is not None
        if flagged:
            body = corrupt_text(body, answer, min(rank, level_cap))

        documents.append(
            {
                "id": index,
                "title": f"{publisher} Document {index + 1}",
                "content": body,
                "is_corrupt": flagged,
            }
        )
    return documents