File size: 4,672 Bytes
f27bb68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from __future__ import annotations

import logging
from pathlib import Path
from typing import Dict, List, Tuple

import yaml

from .config import DATA_DIR
from .models import ClarifyingChoice, ClarifyingQuestion, IntentMeta, PageMeta, Passage, Section

logger = logging.getLogger(__name__)


def _load_yaml(path: Path):
    if not path.exists():
        raise FileNotFoundError(f"Required data file not found: {path}")
    with path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f) or []


def load_sections(data_dir: Path = DATA_DIR) -> Dict[str, Section]:
    """Load ``sections.yaml`` from *data_dir* into a mapping keyed by section id.

    ``section_id`` and ``title`` are required on every row and are coerced to
    ``str``. ``parent_id`` is optional; when present it is coerced to ``str``
    as well, so it compares equal to the string keys of the returned mapping
    even if the ids were written as bare numbers in the YAML file.

    A repeated section id is logged as a warning and the later row replaces
    the earlier one.

    Raises:
        FileNotFoundError: if ``sections.yaml`` does not exist.
        KeyError: if a row lacks ``section_id`` or ``title``.
    """
    sections: Dict[str, Section] = {}
    for row in _load_yaml(data_dir / "sections.yaml"):
        parent_id = row.get("parent_id")
        sec = Section(
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            # Normalise like section_id; stay None when no parent is given.
            parent_id=None if parent_id is None else str(parent_id),
        )
        if sec.section_id in sections:
            logger.warning("Duplicate section id found: %s", sec.section_id)
        sections[sec.section_id] = sec
    return sections


def load_page_index(data_dir: Path = DATA_DIR) -> Dict[str, PageMeta]:
    """Load ``page_index.yaml`` from *data_dir* into a mapping keyed by page id.

    ``page_id``, ``guideline_id``, ``section_id``, ``title`` and ``summary``
    are required on every row and are coerced to ``str``. ``intent_ids`` is
    optional; a missing key and an explicit YAML null both produce an empty
    list.

    A repeated page id is logged as a warning and the later row replaces the
    earlier one.

    Raises:
        FileNotFoundError: if ``page_index.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys.
    """
    pages: Dict[str, PageMeta] = {}
    for row in _load_yaml(data_dir / "page_index.yaml"):
        page = PageMeta(
            page_id=str(row["page_id"]),
            guideline_id=str(row["guideline_id"]),
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            summary=str(row["summary"]),
            # ``intent_ids:`` with no value loads as None; list(None) would raise.
            intent_ids=list(row.get("intent_ids") or []),
        )
        if page.page_id in pages:
            logger.warning("Duplicate page id found: %s", page.page_id)
        pages[page.page_id] = page
    return pages


def load_passages(data_dir: Path = DATA_DIR) -> Dict[str, Passage]:
    """Load ``passages.yaml`` from *data_dir* into a mapping keyed by passage id.

    ``passage_id``, ``guideline_id``, ``page_id``, ``section_id`` and ``text``
    are required and coerced to ``str``; ``order_in_section`` and
    ``source_page`` are required and coerced to ``int``. ``source_lines`` and
    ``tags`` are optional; a missing key and an explicit YAML null both
    produce an empty list.

    A repeated passage id is logged as a warning and the later row replaces
    the earlier one.

    Raises:
        FileNotFoundError: if ``passages.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys.
        ValueError, TypeError: if ``order_in_section`` or ``source_page``
            cannot be converted to ``int``.
    """
    passages: Dict[str, Passage] = {}
    for row in _load_yaml(data_dir / "passages.yaml"):
        passage = Passage(
            passage_id=str(row["passage_id"]),
            guideline_id=str(row["guideline_id"]),
            page_id=str(row["page_id"]),
            section_id=str(row["section_id"]),
            order_in_section=int(row["order_in_section"]),
            text=str(row["text"]),
            source_page=int(row["source_page"]),
            # A key written with no value loads as None; list(None) would raise.
            source_lines=list(row.get("source_lines") or []),
            tags=list(row.get("tags") or []),
        )
        if passage.passage_id in passages:
            logger.warning("Duplicate passage id found: %s", passage.passage_id)
        passages[passage.passage_id] = passage
    return passages


def load_intents(data_dir: Path = DATA_DIR) -> Dict[str, IntentMeta]:
    """Load ``intents.yaml`` from *data_dir* into a mapping keyed by intent id.

    ``intent_id``, ``name``, ``description`` and ``topic_group`` are required
    on every row and are coerced to ``str``. ``guideline_ids``,
    ``primary_section_ids`` and ``example_questions`` are optional; a missing
    key and an explicit YAML null both produce an empty list.

    A repeated intent id is logged as a warning and the later row replaces
    the earlier one.

    Raises:
        FileNotFoundError: if ``intents.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys.
    """
    intents: Dict[str, IntentMeta] = {}
    for row in _load_yaml(data_dir / "intents.yaml"):
        intent = IntentMeta(
            intent_id=str(row["intent_id"]),
            name=str(row["name"]),
            description=str(row["description"]),
            topic_group=str(row["topic_group"]),
            # A key written with no value loads as None; list(None) would raise.
            guideline_ids=list(row.get("guideline_ids") or []),
            primary_section_ids=list(row.get("primary_section_ids") or []),
            example_questions=list(row.get("example_questions") or []),
        )
        if intent.intent_id in intents:
            logger.warning("Duplicate intent id found: %s", intent.intent_id)
        intents[intent.intent_id] = intent
    return intents


def load_clarifying(path: Path) -> List[ClarifyingQuestion]:
    """Load clarifying questions from the YAML file at *path*, in file order.

    Each row requires ``question_id`` and ``text`` (coerced to ``str``).
    ``applies_to_topic_groups`` and ``choices`` are optional; a missing key
    and an explicit YAML null both produce an empty list. Each choice requires
    ``id`` and ``text`` (coerced to ``str``); its optional ``update_state``
    is copied into a new dict, defaulting to an empty one when missing or null.

    Raises:
        FileNotFoundError: if *path* does not exist.
        KeyError: if a row or choice lacks one of its required keys.
    """
    questions: List[ClarifyingQuestion] = []
    for row in _load_yaml(path):
        # ``or`` guards against keys written with no value, which load as None.
        choices = [
            ClarifyingChoice(
                id=str(choice["id"]),
                text=str(choice["text"]),
                update_state=dict(choice.get("update_state") or {}),
            )
            for choice in row.get("choices") or []
        ]
        questions.append(
            ClarifyingQuestion(
                question_id=str(row["question_id"]),
                applies_to_topic_groups=list(row.get("applies_to_topic_groups") or []),
                text=str(row["text"]),
                choices=choices,
            )
        )
    return questions


def load_state_schema(data_dir: Path = DATA_DIR) -> Tuple[dict, dict]:
    """Return ``(fields, defaults)`` read from ``state_schema.yaml`` in *data_dir*.

    If the document is a non-empty list, its first element is used as the
    schema; otherwise the document itself is used. ``fields`` is the schema's
    ``fields`` mapping, or an empty dict when the schema is not a mapping or
    ``fields`` is missing or null.

    ``defaults`` has one entry per field:

    * type ``"str"``: the first ``enum`` entry, or ``""`` when there is no
      (or an empty) ``enum``;
    * every other type, including ``"bool"``: ``None``.

    NOTE(review): this docstring previously said bool fields default to
    ``False``, but the code assigns ``None``. The runtime value is kept
    unchanged here; confirm which of the two is intended.

    Raises:
        FileNotFoundError: if ``state_schema.yaml`` does not exist.
    """
    raw = _load_yaml(data_dir / "state_schema.yaml")
    data = raw[0] if isinstance(raw, list) and raw else raw
    # ``fields:`` written with no value loads as None; treat it as "no fields".
    fields = (data.get("fields") or {}) if isinstance(data, dict) else {}
    defaults = {}
    for key, spec in fields.items():
        # A field declared without a spec (null) is handled like an untyped one.
        if isinstance(spec, dict) and spec.get("type") == "str":
            enum = spec.get("enum")
            defaults[key] = enum[0] if enum else ""
        else:
            defaults[key] = None
    return fields, defaults