from __future__ import annotations

import re
from pathlib import Path
from typing import Dict, Tuple

from .config import DATA_DIR
from .models import PageMeta, Passage, Section


def _normalize_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()


def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
    """Split *text* into overlapping, whitespace-normalized chunks.

    The text is normalized with ``_normalize_text`` first. Each chunk holds at
    most ``chunk_size`` characters, and every chunk after the first starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            be smaller than ``chunk_size``.

    Returns:
        The chunks in document order, or an empty list when the normalized
        text is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    # Without these guards the window below cannot advance and the loop would
    # spin forever on any text longer than a single chunk.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    text = _normalize_text(text)
    if not text:
        return []

    chunks: list[str] = []
    length = len(text)
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            # The final chunk reached the end of the text; stepping back by
            # `overlap` again would only re-emit its tail.
            break
        start = max(end - overlap, 0)
    return chunks


def load_user_docs(
    data_dir: Path | None = None,
) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
    """Load the ``*.md`` files of a directory as sections, pages and passages.

    Each Markdown file directly inside the directory yields one ``Section``,
    one ``PageMeta`` and one ``Passage`` per chunk produced by ``_chunk_text``.
    All of them use the guideline id ``"user_docs"``.

    Args:
        data_dir: Directory to scan. Defaults to ``DATA_DIR / "docs"``.

    Returns:
        A ``(sections, pages, passages)`` tuple of dicts keyed by section id,
        page id and passage id respectively. All three are empty when the
        directory does not exist.
    """
    base_dir = data_dir or (DATA_DIR / "docs")
    sections: Dict[str, Section] = {}
    pages: Dict[str, PageMeta] = {}
    passages: Dict[str, Passage] = {}

    if not base_dir.exists():
        return sections, pages, passages

    guideline_id = "user_docs"

    # Sorted so that ids are assigned in a stable order across runs.
    for path in sorted(base_dir.glob("*.md")):
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        raw_text = path.read_text(encoding="utf-8", errors="ignore")

        # NOTE(review): file names that differ only by case or by space vs.
        # underscore (e.g. "My Doc.md" and "my_doc.md") map to the same ids,
        # so the later file overwrites the earlier one — confirm acceptable.
        doc_id = path.stem.lower().replace(" ", "_")
        section_id = f"doc::{doc_id}"
        page_id = f"doc_{doc_id}"
        title = f"外部資料: {path.name}"

        sections[section_id] = Section(
            section_id=section_id,
            title=title,
            parent_id=None,
        )

        # The summary is the normalized first 300 raw characters of the file;
        # a fixed placeholder is used when that prefix is empty.
        summary = _normalize_text(raw_text[:300])
        pages[page_id] = PageMeta(
            page_id=page_id,
            guideline_id=guideline_id,
            section_id=section_id,
            title=title,
            summary=summary or "外部資料の抜粋",
            intent_ids=[],
        )

        for order, chunk in enumerate(_chunk_text(raw_text), start=1):
            passage_id = f"{page_id}_p{order:03d}"
            passages[passage_id] = Passage(
                passage_id=passage_id,
                guideline_id=guideline_id,
                page_id=page_id,
                section_id=section_id,
                order_in_section=order,
                text=chunk,
                # source_page is set to the chunk's 1-based ordinal.
                source_page=order,
                source_lines=[],
                tags=[path.name],
            )

    return sections, pages, passages