Spaces:
Sleeping
Sleeping
import random
from typing import Any

from data.corruption import corrupt_text
# Publisher names that generated documents are attributed to;
# generate_documents() draws one at random for each document.
SOURCES = [
    "Encyclopedia Britannica",
    "Reuters Fact Check",
    "National Geographic",
    "Smithsonian Magazine",
    "BBC Reference Desk",
    "Oxford Reference",
    "World Almanac",
    "Associated Press Archive",
    "Library of Congress Notes",
    "Academic Knowledge Base",
]
# Sentence templates for a document body. Each one is filled in with
# str.format() using the named fields {source}, {question} and {answer}.
TEMPLATES = [
    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
    "{source} records {answer} as the accepted answer when asked: '{question}'",
    "A background note from {source} explains that {answer} is the established response to '{question}'",
    "According to {source}, researchers commonly answer '{question}' with {answer}.",
    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
    "For the prompt '{question}', {source} reports that the answer is {answer}.",
]
| def _as_text(value: Any, default: str = "") -> str: | |
| if value is None: | |
| return default | |
| text = str(value).strip() | |
| return text or default | |
def generate_documents(
    fact: dict[str, Any],
    num_docs: int = 8,
    corrupt_positions: list[int] | None = None,
) -> list[dict[str, Any]]:
    """Build ``num_docs`` short documents that each state the fact's answer.

    Every document combines a randomly chosen entry of ``SOURCES`` with a
    randomly chosen entry of ``TEMPLATES``. Documents whose index appears in
    ``corrupt_positions`` have their content passed through ``corrupt_text``.

    Args:
        fact: Mapping read for the ``"question"`` and ``"answer"`` keys.
            Missing, ``None`` or blank values fall back to
            ``"Unknown question?"`` and ``"unknown"`` respectively.
        num_docs: Number of documents to generate.
        corrupt_positions: Zero-based document indices to corrupt. The n-th
            entry (counting from 1) is corrupted at level ``min(n, 4)``; if
            an index is repeated, its last occurrence decides the level.
            Indices outside ``range(num_docs)`` select no document but
            still count toward the position of later entries.

    Returns:
        One dict per document, in index order, with the keys ``"id"``,
        ``"title"``, ``"content"`` and ``"is_corrupt"``.
    """
    question = _as_text(fact.get("question"), "Unknown question?")
    answer = _as_text(fact.get("answer"), "unknown")

    # Map each corrupted index to its 1-based position in corrupt_positions.
    # The dict doubles as the membership test for "is this document corrupt?".
    corrupt_order = {
        doc_id: position
        for position, doc_id in enumerate(corrupt_positions or [], start=1)
    }

    documents: list[dict[str, Any]] = []
    for doc_id in range(num_docs):
        # Draw the source before the template so seeded runs stay reproducible.
        source = random.choice(SOURCES)
        template = random.choice(TEMPLATES)
        content = template.format(source=source, question=question, answer=answer)

        is_corrupt = doc_id in corrupt_order
        if is_corrupt:
            # Level grows with the position in corrupt_positions, capped at 4
            # (presumably the highest level corrupt_text supports -- confirm).
            level = min(corrupt_order[doc_id], 4)
            content = corrupt_text(content, answer, level)

        documents.append(
            {
                "id": doc_id,
                "title": f"{source} Document {doc_id + 1}",
                "content": content,
                "is_corrupt": is_corrupt,
            }
        )
    return documents