File size: 2,367 Bytes
204fa23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import random
from typing import Any

from data.corruption import corrupt_text


# Publisher names used to attribute the generated documents.
# generate_documents picks one at random for each document and uses it both
# in the document title and as the {source} value of the chosen template.
SOURCES = [
    "Encyclopedia Britannica",
    "Reuters Fact Check",
    "National Geographic",
    "Smithsonian Magazine",
    "BBC Reference Desk",
    "Oxford Reference",
    "World Almanac",
    "Associated Press Archive",
    "Library of Congress Notes",
    "Academic Knowledge Base",
]

# Sentence templates for document bodies. Each one is rendered with
# str.format using the keyword fields {source}, {question} and {answer};
# generate_documents picks one at random per document.
# NOTE(review): some templates end with a period while others end at the
# closing quote of the question -- confirm whether that variation is intended.
TEMPLATES = [
    "{source} summarizes the question '{question}' and identifies the answer as {answer}.",
    "In its reference entry, {source} states that the correct answer to '{question}' is {answer}.",
    "{source} records {answer} as the accepted answer when asked: '{question}'",
    "A background note from {source} explains that {answer} is the established response to '{question}'",
    "According to {source}, researchers commonly answer '{question}' with {answer}.",
    "{source} lists the verified answer for '{question}' as {answer}, matching standard references.",
    "The archive maintained by {source} gives {answer} as the answer to '{question}'",
    "For the prompt '{question}', {source} reports that the answer is {answer}.",
]


def _as_text(value: Any, default: str = "") -> str:
    if value is None:
        return default
    text = str(value).strip()
    return text or default


def generate_documents(
    fact: dict[str, Any],
    num_docs: int = 8,
    corrupt_positions: list[int] | None = None,
) -> list[dict[str, Any]]:
    """Build ``num_docs`` templated documents that state the fact's answer.

    Args:
        fact: Mapping read for the ``"question"`` and ``"answer"`` keys. A
            missing, ``None`` or blank value becomes ``"Unknown question?"``
            or ``"unknown"`` respectively.
        num_docs: Number of documents to produce; ids run from 0 upwards.
        corrupt_positions: Document ids whose content is passed through
            ``corrupt_text``. The level handed to ``corrupt_text`` is the
            1-based position of the id in this list, capped at 4. If an id
            is listed more than once, its last occurrence determines the
            level.

    Returns:
        One dict per document with the keys ``"id"``, ``"title"``,
        ``"content"`` and ``"is_corrupt"``.
    """
    question = _as_text(fact.get("question"), "Unknown question?")
    answer = _as_text(fact.get("answer"), "unknown")

    positions = corrupt_positions or []
    corrupt_ids = set(positions)
    # Level for each corrupt id: its 1-based rank in the list, never above 4.
    level_by_id = {
        position: min(rank, 4) for rank, position in enumerate(positions, start=1)
    }

    documents: list[dict[str, Any]] = []
    for doc_id in range(num_docs):
        # Draw the source before the template so the sequence of random
        # calls stays the same for a seeded generator.
        source = random.choice(SOURCES)
        template = random.choice(TEMPLATES)
        content = template.format(source=source, question=question, answer=answer)

        tampered = doc_id in corrupt_ids
        if tampered:
            content = corrupt_text(content, answer, level_by_id[doc_id])

        record = {
            "id": doc_id,
            "title": f"{source} Document {doc_id + 1}",
            "content": content,
            "is_corrupt": tampered,
        }
        documents.append(record)

    return documents