Spaces:
Sleeping
Sleeping
File size: 4,672 Bytes
f27bb68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from __future__ import annotations
import logging
from pathlib import Path
from typing import Dict, List, Tuple
import yaml
from .config import DATA_DIR
from .models import ClarifyingChoice, ClarifyingQuestion, IntentMeta, PageMeta, Passage, Section
# Module-level logger named after this module; the loaders use it to warn about duplicate ids.
logger = logging.getLogger(__name__)
def _load_yaml(path: Path):
    """Parse the YAML document at *path* and return its content.

    A document that parses to a falsy value (for example an empty file,
    which yields ``None``) is returned as an empty list.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed if parsed else []
    raise FileNotFoundError(f"Required data file not found: {path}")
def load_sections(data_dir: Path = DATA_DIR) -> Dict[str, Section]:
    """Load ``sections.yaml`` from *data_dir* into a dict keyed by section id.

    Every row must provide ``section_id`` and ``title``; ``parent_id`` is
    optional. Ids are coerced to ``str`` so that a parent reference written
    as a bare number in YAML still matches the string keys of the returned
    dict. When a section id occurs more than once, a warning is logged and
    the later row replaces the earlier one.

    Raises:
        FileNotFoundError: if ``sections.yaml`` does not exist.
        KeyError: if a row lacks ``section_id`` or ``title`` (rows are
            assumed to be mappings).
    """
    sections: Dict[str, Section] = {}
    for row in _load_yaml(data_dir / "sections.yaml"):
        raw_parent = row.get("parent_id")
        sec = Section(
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            # Normalise like section_id, but keep None to mean "no parent".
            parent_id=None if raw_parent is None else str(raw_parent),
        )
        if sec.section_id in sections:
            logger.warning("Duplicate section id found: %s", sec.section_id)
        sections[sec.section_id] = sec
    return sections
def load_page_index(data_dir: Path = DATA_DIR) -> Dict[str, PageMeta]:
    """Load ``page_index.yaml`` from *data_dir* into a dict keyed by page id.

    Every row must provide ``page_id``, ``guideline_id``, ``section_id``,
    ``title`` and ``summary``; ``intent_ids`` is optional. When a page id
    occurs more than once, a warning is logged and the later row replaces
    the earlier one.

    Raises:
        FileNotFoundError: if ``page_index.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys (rows are assumed
            to be mappings).
    """
    pages: Dict[str, PageMeta] = {}
    for row in _load_yaml(data_dir / "page_index.yaml"):
        page = PageMeta(
            page_id=str(row["page_id"]),
            guideline_id=str(row["guideline_id"]),
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            summary=str(row["summary"]),
            # ``intent_ids:`` with no value parses as None; ``or []`` covers
            # both a missing key and an explicit null.
            intent_ids=list(row.get("intent_ids") or []),
        )
        if page.page_id in pages:
            logger.warning("Duplicate page id found: %s", page.page_id)
        pages[page.page_id] = page
    return pages
def load_passages(data_dir: Path = DATA_DIR) -> Dict[str, Passage]:
    """Load ``passages.yaml`` from *data_dir* into a dict keyed by passage id.

    Every row must provide ``passage_id``, ``guideline_id``, ``page_id``,
    ``section_id``, ``order_in_section``, ``text`` and ``source_page``;
    ``source_lines`` and ``tags`` are optional. ``order_in_section`` and
    ``source_page`` are converted with ``int()``. When a passage id occurs
    more than once, a warning is logged and the later row replaces the
    earlier one.

    Raises:
        FileNotFoundError: if ``passages.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys (rows are assumed
            to be mappings).
        ValueError: if ``order_in_section`` or ``source_page`` cannot be
            converted to ``int``.
    """
    passages: Dict[str, Passage] = {}
    for row in _load_yaml(data_dir / "passages.yaml"):
        passage = Passage(
            passage_id=str(row["passage_id"]),
            guideline_id=str(row["guideline_id"]),
            page_id=str(row["page_id"]),
            section_id=str(row["section_id"]),
            order_in_section=int(row["order_in_section"]),
            text=str(row["text"]),
            source_page=int(row["source_page"]),
            # A key present with no value parses as None; ``or []`` covers
            # both a missing key and an explicit null.
            source_lines=list(row.get("source_lines") or []),
            tags=list(row.get("tags") or []),
        )
        if passage.passage_id in passages:
            logger.warning("Duplicate passage id found: %s", passage.passage_id)
        passages[passage.passage_id] = passage
    return passages
def load_intents(data_dir: Path = DATA_DIR) -> Dict[str, IntentMeta]:
    """Load ``intents.yaml`` from *data_dir* into a dict keyed by intent id.

    Every row must provide ``intent_id``, ``name``, ``description`` and
    ``topic_group``; ``guideline_ids``, ``primary_section_ids`` and
    ``example_questions`` are optional. When an intent id occurs more than
    once, a warning is logged and the later row replaces the earlier one.

    Raises:
        FileNotFoundError: if ``intents.yaml`` does not exist.
        KeyError: if a row lacks one of the required keys (rows are assumed
            to be mappings).
    """
    intents: Dict[str, IntentMeta] = {}
    for row in _load_yaml(data_dir / "intents.yaml"):
        intent = IntentMeta(
            intent_id=str(row["intent_id"]),
            name=str(row["name"]),
            description=str(row["description"]),
            topic_group=str(row["topic_group"]),
            # A key present with no value parses as None; ``or []`` covers
            # both a missing key and an explicit null.
            guideline_ids=list(row.get("guideline_ids") or []),
            primary_section_ids=list(row.get("primary_section_ids") or []),
            example_questions=list(row.get("example_questions") or []),
        )
        if intent.intent_id in intents:
            logger.warning("Duplicate intent id found: %s", intent.intent_id)
        intents[intent.intent_id] = intent
    return intents
def load_clarifying(path: Path) -> List[ClarifyingQuestion]:
    """Load clarifying questions from the YAML file at *path*.

    Every row must provide ``question_id`` and ``text``;
    ``applies_to_topic_groups`` and ``choices`` are optional. Each choice
    must provide ``id`` and ``text`` and may carry an ``update_state``
    mapping. Questions are returned in file order.

    Raises:
        FileNotFoundError: if *path* does not exist.
        KeyError: if a row or choice lacks one of its required keys (rows
            and choices are assumed to be mappings).
    """
    questions: List[ClarifyingQuestion] = []
    for row in _load_yaml(path):
        # A key present with no value parses as None; ``or`` covers both a
        # missing key and an explicit null for every optional collection.
        choices = [
            ClarifyingChoice(
                id=str(choice["id"]),
                text=str(choice["text"]),
                update_state=dict(choice.get("update_state") or {}),
            )
            for choice in row.get("choices") or []
        ]
        questions.append(
            ClarifyingQuestion(
                question_id=str(row["question_id"]),
                applies_to_topic_groups=list(row.get("applies_to_topic_groups") or []),
                text=str(row["text"]),
                choices=choices,
            )
        )
    return questions
def load_state_schema(data_dir: Path = DATA_DIR) -> Tuple[dict, dict]:
    """Return ``(fields, defaults)`` read from ``state_schema.yaml``.

    ``fields`` is the ``fields`` mapping of the schema document, or an empty
    dict when the document is not a mapping or has no fields. ``defaults``
    holds one entry per field:

    * ``type: str`` -> the first ``enum`` entry, or ``""`` when there is no
      (or an empty) enum;
    * any other type, including ``bool`` -> ``None``.

    Raises:
        FileNotFoundError: if ``state_schema.yaml`` does not exist.
    """
    # NOTE(review): the previous docstring promised ``False`` for bool fields,
    # but the code assigns ``None``. Runtime behaviour is kept as-is and the
    # documentation corrected; confirm which one was intended.
    raw = _load_yaml(data_dir / "state_schema.yaml")
    # The document may be the mapping itself or a list whose first item is it.
    data = raw[0] if isinstance(raw, list) and raw else raw
    # ``fields:`` with no value parses as None; treat that like "no fields".
    fields = (data.get("fields") or {}) if isinstance(data, dict) else {}

    defaults: dict = {}
    for key, spec in fields.items():
        # A field declared without a spec body parses as None; give it the
        # same default as an untyped field rather than crashing on ``.get``.
        field_type = spec.get("type") if isinstance(spec, dict) else None
        if field_type == "str":
            enum = spec.get("enum")
            defaults[key] = enum[0] if enum else ""
        else:
            defaults[key] = None
    return fields, defaults
|