Spaces:

KS00Max
/

diabetesLLM

Sleeping

File size: 4,672 Bytes

f27bb68

from __future__ import annotations

import logging
from pathlib import Path
from typing import Dict, List, Tuple

import yaml

from .config import DATA_DIR
from .models import ClarifyingChoice, ClarifyingQuestion, IntentMeta, PageMeta, Passage, Section

# Module-level logger; the loaders in this module use it to warn about duplicate ids.
logger = logging.getLogger(__name__)


def _load_yaml(path: Path):
    """Parse the YAML file at *path* and return its content.

    A document that parses to a falsy value (for example an empty file, which
    yields ``None``) is returned as an empty list so callers can iterate the
    result unconditionally.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed if parsed else []
    raise FileNotFoundError(f"Required data file not found: {path}")


def load_sections(data_dir: Path = DATA_DIR) -> Dict[str, Section]:
    """Load ``sections.yaml`` from *data_dir* into a mapping keyed by section id.

    ``section_id`` and ``title`` are required and coerced to ``str``.
    ``parent_id`` is optional; when present and non-null it is coerced to
    ``str`` as well, so it can be used to look up the parent in the returned
    mapping even if the YAML wrote the id as a bare number.

    When two rows share a ``section_id`` a warning is logged and the later row
    replaces the earlier one.

    Raises:
        FileNotFoundError: if ``sections.yaml`` is absent (raised by ``_load_yaml``).
        KeyError: if a row lacks ``section_id`` or ``title``.
    """
    sections: Dict[str, Section] = {}
    for row in _load_yaml(data_dir / "sections.yaml"):
        raw_parent = row.get("parent_id")
        sec = Section(
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            # Keep None for root sections; otherwise normalise exactly like
            # section_id so parent references match the mapping's string keys.
            parent_id=None if raw_parent is None else str(raw_parent),
        )
        if sec.section_id in sections:
            logger.warning("Duplicate section id found: %s", sec.section_id)
        sections[sec.section_id] = sec
    return sections


def load_page_index(data_dir: Path = DATA_DIR) -> Dict[str, PageMeta]:
    """Load ``page_index.yaml`` from *data_dir* into a mapping keyed by page id.

    ``page_id``, ``guideline_id``, ``section_id``, ``title`` and ``summary``
    are required and coerced to ``str``. ``intent_ids`` is optional: a missing
    key and an explicit YAML null both yield an empty list.

    When two rows share a ``page_id`` a warning is logged and the later row
    replaces the earlier one.

    Raises:
        FileNotFoundError: if ``page_index.yaml`` is absent (raised by ``_load_yaml``).
        KeyError: if a row lacks one of the required keys.
    """
    pages: Dict[str, PageMeta] = {}
    for row in _load_yaml(data_dir / "page_index.yaml"):
        page = PageMeta(
            page_id=str(row["page_id"]),
            guideline_id=str(row["guideline_id"]),
            section_id=str(row["section_id"]),
            title=str(row["title"]),
            summary=str(row["summary"]),
            # ``row.get(key, [])`` still returns None when the key is present
            # with a null value, and ``list(None)`` raises; ``or []`` covers both.
            intent_ids=list(row.get("intent_ids") or []),
        )
        if page.page_id in pages:
            logger.warning("Duplicate page id found: %s", page.page_id)
        pages[page.page_id] = page
    return pages


def load_passages(data_dir: Path = DATA_DIR) -> Dict[str, Passage]:
    """Load ``passages.yaml`` from *data_dir* into a mapping keyed by passage id.

    The id fields and ``text`` are required and coerced to ``str``;
    ``order_in_section`` and ``source_page`` are required and coerced to
    ``int``. ``source_lines`` and ``tags`` are optional: a missing key and an
    explicit YAML null both yield an empty list.

    When two rows share a ``passage_id`` a warning is logged and the later row
    replaces the earlier one.

    Raises:
        FileNotFoundError: if ``passages.yaml`` is absent (raised by ``_load_yaml``).
        KeyError: if a row lacks one of the required keys.
        ValueError: if ``order_in_section`` or ``source_page`` cannot be
            converted by ``int()``.
    """
    passages: Dict[str, Passage] = {}
    for row in _load_yaml(data_dir / "passages.yaml"):
        passage = Passage(
            passage_id=str(row["passage_id"]),
            guideline_id=str(row["guideline_id"]),
            page_id=str(row["page_id"]),
            section_id=str(row["section_id"]),
            order_in_section=int(row["order_in_section"]),
            text=str(row["text"]),
            source_page=int(row["source_page"]),
            # ``or []`` rather than a ``get`` default: a key that is present
            # but null returns None from ``get`` and ``list(None)`` raises.
            source_lines=list(row.get("source_lines") or []),
            tags=list(row.get("tags") or []),
        )
        if passage.passage_id in passages:
            logger.warning("Duplicate passage id found: %s", passage.passage_id)
        passages[passage.passage_id] = passage
    return passages


def load_intents(data_dir: Path = DATA_DIR) -> Dict[str, IntentMeta]:
    """Load ``intents.yaml`` from *data_dir* into a mapping keyed by intent id.

    ``intent_id``, ``name``, ``description`` and ``topic_group`` are required
    and coerced to ``str``. ``guideline_ids``, ``primary_section_ids`` and
    ``example_questions`` are optional: a missing key and an explicit YAML
    null both yield an empty list.

    When two rows share an ``intent_id`` a warning is logged and the later row
    replaces the earlier one.

    Raises:
        FileNotFoundError: if ``intents.yaml`` is absent (raised by ``_load_yaml``).
        KeyError: if a row lacks one of the required keys.
    """
    intents: Dict[str, IntentMeta] = {}
    for row in _load_yaml(data_dir / "intents.yaml"):
        intent = IntentMeta(
            intent_id=str(row["intent_id"]),
            name=str(row["name"]),
            description=str(row["description"]),
            topic_group=str(row["topic_group"]),
            # ``or []`` rather than a ``get`` default: a key that is present
            # but null returns None from ``get`` and ``list(None)`` raises.
            guideline_ids=list(row.get("guideline_ids") or []),
            primary_section_ids=list(row.get("primary_section_ids") or []),
            example_questions=list(row.get("example_questions") or []),
        )
        if intent.intent_id in intents:
            logger.warning("Duplicate intent id found: %s", intent.intent_id)
        intents[intent.intent_id] = intent
    return intents


def load_clarifying(path: Path) -> List[ClarifyingQuestion]:
    """Load clarifying questions from the YAML file at *path*, in file order.

    Each row needs ``question_id`` and ``text``; each choice needs ``id`` and
    ``text`` (all coerced to ``str``). ``choices``,
    ``applies_to_topic_groups`` and a choice's ``update_state`` are optional:
    a missing key and an explicit YAML null both yield an empty container.

    Unlike the id-keyed loaders in this module, no duplicate detection is
    performed; every row is appended to the result.

    Raises:
        FileNotFoundError: if *path* does not exist (raised by ``_load_yaml``).
        KeyError: if a row or choice lacks one of the required keys.
    """
    questions: List[ClarifyingQuestion] = []
    for row in _load_yaml(path):
        # ``or`` fallbacks rather than ``get`` defaults: a key that is present
        # but null returns None from ``get``, which cannot be iterated or
        # passed to ``dict()`` / ``list()``.
        choices = [
            ClarifyingChoice(
                id=str(choice["id"]),
                text=str(choice["text"]),
                update_state=dict(choice.get("update_state") or {}),
            )
            for choice in (row.get("choices") or [])
        ]
        questions.append(
            ClarifyingQuestion(
                question_id=str(row["question_id"]),
                applies_to_topic_groups=list(row.get("applies_to_topic_groups") or []),
                text=str(row["text"]),
                choices=choices,
            )
        )
    return questions


def load_state_schema(data_dir: Path = DATA_DIR) -> Tuple[dict, dict]:
    """Return ``(fields, defaults)`` read from ``state_schema.yaml`` in *data_dir*.

    ``fields`` is the mapping stored under the document's ``fields`` key. If
    the YAML top level is a non-empty list, its first element is used as the
    document. A missing or null ``fields`` key, or a document that is not a
    mapping, yields an empty ``fields``.

    ``defaults`` has one entry per field:

    * ``type == "str"``: the first ``enum`` entry, or ``""`` when the field
      has no (or an empty) ``enum``.
    * any other type, including ``"bool"``: ``None``.

    Raises:
        FileNotFoundError: if ``state_schema.yaml`` is absent (raised by
            ``_load_yaml``).
    """
    raw = _load_yaml(data_dir / "state_schema.yaml")
    data = raw[0] if isinstance(raw, list) and raw else raw
    # ``or {}`` so that a ``fields:`` key with a null value behaves like a
    # missing key instead of failing on ``None.items()``.
    fields = (data.get("fields") or {}) if isinstance(data, dict) else {}
    defaults = {}
    for key, spec in fields.items():
        # A field declared with no body parses to None; it has no type and
        # therefore falls through to the None default.
        field_type = spec.get("type") if isinstance(spec, dict) else None
        if field_type == "str":
            enum = spec.get("enum")
            defaults[key] = enum[0] if enum else ""
        else:
            # Bool fields deliberately share this branch: they start as None,
            # not False.
            defaults[key] = None
    return fields, defaults