Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| from pathlib import Path | |
| from typing import Dict, Tuple | |
| from .config import DATA_DIR | |
| from .models import PageMeta, Passage, Section | |
| def _normalize_text(text: str) -> str: | |
| return re.sub(r"\s+", " ", text).strip() | |
def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
    """Split whitespace-normalized *text* into overlapping character windows.

    Args:
        text: Raw text; it is passed through ``_normalize_text`` before
            splitting.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next. Must be smaller than ``chunk_size``.
            A negative value leaves a gap of that many characters between
            consecutive chunks.

    Returns:
        The chunks in order; the last one may be shorter than ``chunk_size``.
        An empty list when the normalized text is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    # Without these checks the window start would never advance and the loop
    # below would append chunks forever.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    text = _normalize_text(text)
    if not text:
        return []

    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    length = len(text)
    chunks = []
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            # This window reached the end of the text; a further window would
            # only repeat its tail.
            break
        start += step
    return chunks
def load_user_docs(data_dir: Path | None = None) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
    """Load every ``*.md`` file in a directory as a section, a page and passages.

    Each file yields one ``Section``, one ``PageMeta`` and one ``Passage`` per
    chunk produced by ``_chunk_text``.

    Args:
        data_dir: Directory to scan. Falls back to ``DATA_DIR / "docs"`` when
            omitted.

    Returns:
        A ``(sections, pages, passages)`` tuple of dicts keyed by section id
        (``doc::<doc_id>``), page id (``doc_<doc_id>``) and passage id
        (``<page_id>_pNNN``). All three are empty when the directory does not
        exist or contains no Markdown files.
    """
    base_dir = data_dir or (DATA_DIR / "docs")
    sections: Dict[str, Section] = {}
    pages: Dict[str, PageMeta] = {}
    passages: Dict[str, Passage] = {}
    if not base_dir.exists():
        return sections, pages, passages

    # Sorted so that ids, including any collision suffixes, are deterministic.
    for path in sorted(base_dir.glob("*.md")):
        # The glob also matches directories named "*.md"; reading one would
        # raise and abort the whole load.
        if not path.is_file():
            continue
        # Undecodable bytes are dropped instead of failing the load.
        raw_text = path.read_text(encoding="utf-8", errors="ignore")

        # Different file names can normalize to the same id (for example
        # "My Doc.md" and "my_doc.md"); add a numeric suffix so a later file
        # does not overwrite an earlier one.
        base_id = path.stem.lower().replace(" ", "_")
        doc_id = base_id
        suffix = 2
        while f"doc::{doc_id}" in sections:
            doc_id = f"{base_id}_{suffix}"
            suffix += 1

        section_id = f"doc::{doc_id}"
        page_id = f"doc_{doc_id}"
        title = f"外部資料: {path.name}"
        sections[section_id] = Section(section_id=section_id, title=title, parent_id=None)

        # The summary is the whitespace-normalized first 300 raw characters.
        summary = _normalize_text(raw_text[:300])
        pages[page_id] = PageMeta(
            page_id=page_id,
            guideline_id="user_docs",
            section_id=section_id,
            title=title,
            summary=summary or "外部資料の抜粋",
            intent_ids=[],
        )

        for order, chunk in enumerate(_chunk_text(raw_text), start=1):
            passage_id = f"{page_id}_p{order:03d}"
            passages[passage_id] = Passage(
                passage_id=passage_id,
                guideline_id="user_docs",
                page_id=page_id,
                section_id=section_id,
                order_in_section=order,
                text=chunk,
                # The 1-based chunk number is stored as the source page.
                source_page=order,
                source_lines=[],
                tags=[path.name],
            )
    return sections, pages, passages