Spaces:

KS00Max
/

diabetesLLM

Sleeping

App Files Files Community

diabetesLLM / core /documents.py

KS00Max

first commit

f27bb68 2 months ago

raw

history blame contribute delete

2.27 kB

	from __future__ import annotations

	import re
	from pathlib import Path
	from typing import Dict, Tuple

	from .config import DATA_DIR
	from .models import PageMeta, Passage, Section


	def _normalize_text(text: str) -> str:
	return re.sub(r"\s+", " ", text).strip()


	def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
	    """Split whitespace-normalized text into fixed-size, overlapping chunks.

	    The text is first passed through ``_normalize_text``. Chunks are then cut
	    every ``chunk_size - overlap`` characters, so consecutive chunks share
	    ``overlap`` characters. The last chunk may be shorter than ``chunk_size``.

	    Args:
	        text: Raw text to split.
	        chunk_size: Maximum number of characters per chunk; must be positive.
	        overlap: Number of characters shared by consecutive chunks; must be
	            smaller than ``chunk_size``. A negative value leaves a gap of
	            that many characters between chunks.

	    Returns:
	        The chunks in document order, or an empty list when the normalized
	        text is empty.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
	            smaller than ``chunk_size``.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    if overlap >= chunk_size:
	        # Without this guard the window would never advance and the loop
	        # below could not terminate on text longer than one chunk.
	        raise ValueError(
	            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
	        )

	    text = _normalize_text(text)
	    if not text:
	        return []

	    length = len(text)
	    stride = chunk_size - overlap  # strictly positive thanks to the checks above
	    chunks = []
	    for start in range(0, length, stride):
	        end = start + chunk_size
	        chunks.append(text[start:end])
	        if end >= length:
	            # This chunk already reaches the end of the text; a further one
	            # would only repeat its tail.
	            break
	    return chunks


	def load_user_docs(data_dir: Path | None = None) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
	    """Load user-supplied Markdown files as sections, pages and passages.

	    Every ``*.md`` file directly inside the docs directory (no recursion)
	    yields one ``Section``, one ``PageMeta`` and one ``Passage`` per chunk
	    returned by ``_chunk_text``. Files are visited in sorted path order.

	    Args:
	        data_dir: Directory to scan. Defaults to ``DATA_DIR / "docs"``.

	    Returns:
	        A ``(sections, pages, passages)`` tuple of dictionaries keyed by
	        ``section_id``, ``page_id`` and ``passage_id`` respectively. All
	        three are empty when the directory does not exist.
	    """
	    base_dir = data_dir or (DATA_DIR / "docs")
	    sections: Dict[str, Section] = {}
	    pages: Dict[str, PageMeta] = {}
	    passages: Dict[str, Passage] = {}

	    # A missing path, or one that is not a directory, means "no user docs".
	    if not base_dir.is_dir():
	        return sections, pages, passages

	    used_doc_ids = set()
	    for path in sorted(base_dir.glob("*.md")):
	        # Undecodable bytes are dropped instead of aborting the whole load.
	        raw_text = path.read_text(encoding="utf-8", errors="ignore")

	        # Different file names can normalize to the same id (for example
	        # "My Doc.md" and "my_doc.md"). Give later files a numeric suffix so
	        # they do not overwrite the earlier file's section, page and passages.
	        base_id = path.stem.lower().replace(" ", "_")
	        doc_id = base_id
	        suffix = 2
	        while doc_id in used_doc_ids:
	            doc_id = f"{base_id}_{suffix}"
	            suffix += 1
	        used_doc_ids.add(doc_id)

	        section_id = f"doc::{doc_id}"
	        page_id = f"doc_{doc_id}"
	        title = f"外部資料: {path.name}"
	        sections[section_id] = Section(section_id=section_id, title=title, parent_id=None)

	        # The summary is the whitespace-normalized first 300 raw characters;
	        # a fixed placeholder is used when that comes out empty.
	        summary = _normalize_text(raw_text[:300])
	        pages[page_id] = PageMeta(
	            page_id=page_id,
	            guideline_id="user_docs",
	            section_id=section_id,
	            title=title,
	            summary=summary or "外部資料の抜粋",
	            intent_ids=[],
	        )

	        for idx, chunk in enumerate(_chunk_text(raw_text)):
	            passage_id = f"{page_id}_p{idx+1:03d}"
	            passages[passage_id] = Passage(
	                passage_id=passage_id,
	                guideline_id="user_docs",
	                page_id=page_id,
	                section_id=section_id,
	                order_in_section=idx + 1,
	                text=chunk,
	                # The 1-based chunk index is stored as the source page too.
	                source_page=idx + 1,
	                source_lines=[],
	                tags=[path.name],
	            )

	    return sections, pages, passages