| import random |
| from typing import Any |
|
|
| from data.corruption import corrupt_text |
|
|
|
|
# Publisher names that documents are attributed to. One entry is drawn at
# random per generated document, so the order matters for seeded runs.
SOURCES = [
    "Encyclopedia Britannica",
    "Reuters Fact Check",
    "National Geographic",
    "Smithsonian Magazine",
    "BBC Reference Desk",
    "Oxford Reference",
    "World Almanac",
    "Associated Press Archive",
    "Library of Congress Notes",
    "Academic Knowledge Base",
]
|
|
# Sentence patterns for a document body. Each pattern is filled with
# ``str.format`` using the named fields ``source``, ``question`` and ``answer``.
# One entry is drawn at random per generated document, so order matters for
# seeded runs.
TEMPLATES = [
    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
    "{source} records {answer} as the accepted answer when asked: '{question}'",
    "A background note from {source} explains that {answer} is the established response to '{question}'",
    "According to {source}, researchers commonly answer '{question}' with {answer}.",
    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
    "For the prompt '{question}', {source} reports that the answer is {answer}.",
]
|
|
|
|
| def _as_text(value: Any, default: str = "") -> str: |
| if value is None: |
| return default |
| text = str(value).strip() |
| return text or default |
|
|
|
|
def generate_documents(
    fact: dict[str, Any],
    num_docs: int = 8,
    corrupt_positions: list[int] | None = None,
    *,
    rng: random.Random | None = None,
) -> list[dict[str, Any]]:
    """Build ``num_docs`` synthetic reference documents for a single fact.

    Each document pairs a randomly drawn entry of ``SOURCES`` with a randomly
    drawn entry of ``TEMPLATES``, filled in with the fact's question and
    answer. Documents whose id appears in ``corrupt_positions`` have their
    content passed through ``corrupt_text``.

    Args:
        fact: Mapping read for the ``"question"`` and ``"answer"`` keys.
            Missing, ``None`` or blank values fall back to
            ``"Unknown question?"`` and ``"unknown"`` respectively.
        num_docs: Number of documents to produce; ids run from ``0`` to
            ``num_docs - 1``.
        corrupt_positions: Document ids to corrupt. The n-th *distinct* id in
            the list (1-based, first occurrence wins) is corrupted at level
            ``min(n, 4)``. Ids outside ``range(num_docs)`` still consume a
            level but match no document.
        rng: Optional ``random.Random`` instance used for the source and
            template draws, for reproducible output without touching global
            state. When omitted, the module-level ``random`` functions are
            used. It is not passed to ``corrupt_text``.

    Returns:
        A list of dicts with the keys ``"id"``, ``"title"``, ``"content"``
        and ``"is_corrupt"``, ordered by id.
    """
    question = _as_text(fact.get("question"), "Unknown question?")
    answer = _as_text(fact.get("answer"), "unknown")
    chooser = random if rng is None else rng

    # Highest level this function ever hands to corrupt_text.
    max_level = 4

    # dict.fromkeys drops repeated ids while keeping first-occurrence order,
    # so a duplicated id neither skips a level nor inflates later ones.
    distinct_positions = dict.fromkeys(corrupt_positions or [])
    corruption_level = {
        doc_id: min(rank, max_level)
        for rank, doc_id in enumerate(distinct_positions, start=1)
    }

    documents: list[dict[str, Any]] = []
    for doc_id in range(num_docs):
        # Draw order (source first, then template) is kept fixed so seeded
        # runs stay reproducible.
        source = chooser.choice(SOURCES)
        template = chooser.choice(TEMPLATES)
        content = template.format(source=source, question=question, answer=answer)

        level = corruption_level.get(doc_id)
        is_corrupt = level is not None
        if is_corrupt:
            content = corrupt_text(content, answer, level)

        documents.append(
            {
                "id": doc_id,
                "title": f"{source} Document {doc_id + 1}",
                "content": content,
                "is_corrupt": is_corrupt,
            }
        )

    return documents
|
|