Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import logging | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| import yaml | |
| from .config import DATA_DIR | |
| from .models import ClarifyingChoice, ClarifyingQuestion, IntentMeta, PageMeta, Passage, Section | |
| logger = logging.getLogger(__name__) | |
| def _load_yaml(path: Path): | |
| if not path.exists(): | |
| raise FileNotFoundError(f"Required data file not found: {path}") | |
| with path.open("r", encoding="utf-8") as f: | |
| return yaml.safe_load(f) or [] | |
| def load_sections(data_dir: Path = DATA_DIR) -> Dict[str, Section]: | |
| sections: Dict[str, Section] = {} | |
| for row in _load_yaml(data_dir / "sections.yaml"): | |
| sec = Section( | |
| section_id=str(row["section_id"]), | |
| title=str(row["title"]), | |
| parent_id=row.get("parent_id"), | |
| ) | |
| if sec.section_id in sections: | |
| logger.warning("Duplicate section id found: %s", sec.section_id) | |
| sections[sec.section_id] = sec | |
| return sections | |
| def load_page_index(data_dir: Path = DATA_DIR) -> Dict[str, PageMeta]: | |
| pages: Dict[str, PageMeta] = {} | |
| for row in _load_yaml(data_dir / "page_index.yaml"): | |
| page = PageMeta( | |
| page_id=str(row["page_id"]), | |
| guideline_id=str(row["guideline_id"]), | |
| section_id=str(row["section_id"]), | |
| title=str(row["title"]), | |
| summary=str(row["summary"]), | |
| intent_ids=list(row.get("intent_ids", [])), | |
| ) | |
| if page.page_id in pages: | |
| logger.warning("Duplicate page id found: %s", page.page_id) | |
| pages[page.page_id] = page | |
| return pages | |
| def load_passages(data_dir: Path = DATA_DIR) -> Dict[str, Passage]: | |
| passages: Dict[str, Passage] = {} | |
| for row in _load_yaml(data_dir / "passages.yaml"): | |
| passage = Passage( | |
| passage_id=str(row["passage_id"]), | |
| guideline_id=str(row["guideline_id"]), | |
| page_id=str(row["page_id"]), | |
| section_id=str(row["section_id"]), | |
| order_in_section=int(row["order_in_section"]), | |
| text=str(row["text"]), | |
| source_page=int(row["source_page"]), | |
| source_lines=list(row.get("source_lines", [])), | |
| tags=list(row.get("tags", [])), | |
| ) | |
| if passage.passage_id in passages: | |
| logger.warning("Duplicate passage id found: %s", passage.passage_id) | |
| passages[passage.passage_id] = passage | |
| return passages | |
| def load_intents(data_dir: Path = DATA_DIR) -> Dict[str, IntentMeta]: | |
| intents: Dict[str, IntentMeta] = {} | |
| for row in _load_yaml(data_dir / "intents.yaml"): | |
| intent = IntentMeta( | |
| intent_id=str(row["intent_id"]), | |
| name=str(row["name"]), | |
| description=str(row["description"]), | |
| topic_group=str(row["topic_group"]), | |
| guideline_ids=list(row.get("guideline_ids", [])), | |
| primary_section_ids=list(row.get("primary_section_ids", [])), | |
| example_questions=list(row.get("example_questions", [])), | |
| ) | |
| if intent.intent_id in intents: | |
| logger.warning("Duplicate intent id found: %s", intent.intent_id) | |
| intents[intent.intent_id] = intent | |
| return intents | |
| def load_clarifying(path: Path) -> List[ClarifyingQuestion]: | |
| questions: List[ClarifyingQuestion] = [] | |
| for row in _load_yaml(path): | |
| choices = [ | |
| ClarifyingChoice( | |
| id=str(choice["id"]), | |
| text=str(choice["text"]), | |
| update_state=dict(choice.get("update_state", {})), | |
| ) | |
| for choice in row.get("choices", []) | |
| ] | |
| questions.append( | |
| ClarifyingQuestion( | |
| question_id=str(row["question_id"]), | |
| applies_to_topic_groups=list(row.get("applies_to_topic_groups", [])), | |
| text=str(row["text"]), | |
| choices=choices, | |
| ) | |
| ) | |
| return questions | |
| def load_state_schema(data_dir: Path = DATA_DIR) -> Tuple[dict, dict]: | |
| """Return (schema, defaults) where defaults fill enums with first entry and bool with False.""" | |
| schema_path = data_dir / "state_schema.yaml" | |
| raw = _load_yaml(schema_path) | |
| data = raw[0] if isinstance(raw, list) and raw else raw | |
| fields = data.get("fields", {}) if isinstance(data, dict) else {} | |
| defaults = {} | |
| for key, spec in fields.items(): | |
| if spec.get("type") == "bool": | |
| defaults[key] = None | |
| elif spec.get("type") == "str": | |
| enum = spec.get("enum") | |
| defaults[key] = enum[0] if enum else "" | |
| else: | |
| defaults[key] = None | |
| return fields, defaults | |