diabetesLLM / core /documents.py
KS00Max's picture
first commit
f27bb68
from __future__ import annotations
import re
from pathlib import Path
from typing import Dict, Tuple
from .config import DATA_DIR
from .models import PageMeta, Passage, Section
def _normalize_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
    """Split whitespace-normalized *text* into overlapping character chunks.

    The text is first passed through ``_normalize_text``. Each chunk holds at
    most ``chunk_size`` characters, and every chunk after the first starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks. It has to
            be smaller than ``chunk_size`` whenever more than one chunk is
            needed.

    Returns:
        The chunks in order; an empty list when the normalized text is empty.

    Raises:
        ValueError: If another chunk is needed but ``chunk_size`` and
            ``overlap`` do not move the window forward (for example
            ``overlap >= chunk_size``). Previously this case looped forever.
    """
    text = _normalize_text(text)
    if not text:
        return []

    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            break
        next_start = max(end - overlap, 0)
        if next_start <= start:
            # Without forward progress the loop would never reach the end of
            # the text and would keep appending chunks until memory runs out.
            raise ValueError(
                f"chunk_size ({chunk_size}) and overlap ({overlap}) do not "
                "advance the chunk window; overlap must be smaller than chunk_size"
            )
        start = next_start
    return chunks
def load_user_docs(data_dir: Path | None = None) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
    """Build section, page and passage records from Markdown files in a directory.

    Each ``*.md`` file directly inside the directory (visited in sorted path
    order) produces one ``Section``, one ``PageMeta`` and one ``Passage`` per
    chunk returned by ``_chunk_text``. Every record is filed under the
    guideline id ``"user_docs"``.

    Args:
        data_dir: Directory to scan. A falsy value selects ``DATA_DIR / "docs"``.

    Returns:
        A ``(sections, pages, passages)`` tuple of dicts keyed by section id,
        page id and passage id. All three are empty when the directory does
        not exist.
    """
    root = data_dir if data_dir else DATA_DIR / "docs"

    sections: Dict[str, Section] = {}
    pages: Dict[str, PageMeta] = {}
    passages: Dict[str, Passage] = {}
    if not root.exists():
        return sections, pages, passages

    guideline = "user_docs"
    for path in sorted(root.glob("*.md")):
        # Undecodable bytes are dropped rather than raising.
        body = path.read_text(encoding="utf-8", errors="ignore")

        # All ids derive from the lower-cased file stem with spaces replaced
        # by underscores.
        doc_id = path.stem.lower().replace(" ", "_")
        section_id = f"doc::{doc_id}"
        page_id = f"doc_{doc_id}"
        title = f"外部資料: {path.name}"

        sections[section_id] = Section(section_id=section_id, title=title, parent_id=None)

        # The page summary is the normalized first 300 raw characters, with a
        # fixed placeholder when that excerpt is empty.
        excerpt = _normalize_text(body[:300])
        pages[page_id] = PageMeta(
            page_id=page_id,
            guideline_id=guideline,
            section_id=section_id,
            title=title,
            summary=excerpt or "外部資料の抜粋",
            intent_ids=[],
        )

        for idx, piece in enumerate(_chunk_text(body)):
            # The 1-based chunk position is used both as the order within the
            # section and as the source page number.
            position = idx + 1
            passage_id = f"{page_id}_p{idx+1:03d}"
            passages[passage_id] = Passage(
                passage_id=passage_id,
                guideline_id=guideline,
                page_id=page_id,
                section_id=section_id,
                order_in_section=position,
                text=piece,
                source_page=position,
                source_lines=[],
                tags=[path.name],
            )

    return sections, pages, passages