Spaces:
Sleeping
Sleeping
File size: 2,268 Bytes
from __future__ import annotations
import re
from pathlib import Path
from typing import Dict, Tuple
from .config import DATA_DIR
from .models import PageMeta, Passage, Section
def _normalize_text(text: str) -> str:
return re.sub(r"\s+", " ", text).strip()
def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
    """Split *text* into overlapping character chunks.

    The text is whitespace-normalized first, then sliced into windows of at
    most ``chunk_size`` characters where consecutive windows share ``overlap``
    trailing/leading characters.

    Args:
        text: Raw input text.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            strictly smaller than ``chunk_size``.

    Returns:
        The list of chunk strings; empty when the normalized text is empty.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size`` —
            either condition would stop the scan position from advancing,
            which in the original implementation produced an infinite loop.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    text = _normalize_text(text)
    if not text:
        return []
    chunks: list[str] = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            break
        # end - overlap > 0 here: the break above did not fire, so
        # end == start + chunk_size >= chunk_size > overlap.
        start = end - overlap
    return chunks
def load_user_docs(data_dir: Path | None = None) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
    """Index every ``*.md`` file under the docs directory.

    Each markdown file becomes one ``Section`` and one ``PageMeta`` (whose
    summary is a normalized excerpt of the first 300 characters), plus one
    ``Passage`` per overlapping text chunk.

    Args:
        data_dir: Directory to scan; falls back to ``DATA_DIR / "docs"``.

    Returns:
        Three dicts keyed by id — sections, page metadata, passages.
        All three are empty when the directory does not exist.
    """
    root = data_dir if data_dir is not None else DATA_DIR / "docs"
    sections: Dict[str, Section] = {}
    pages: Dict[str, PageMeta] = {}
    passages: Dict[str, Passage] = {}
    if not root.exists():
        return sections, pages, passages

    for md_path in sorted(root.glob("*.md")):
        raw = md_path.read_text(encoding="utf-8", errors="ignore")
        doc_id = md_path.stem.lower().replace(" ", "_")
        section_id = f"doc::{doc_id}"
        page_id = f"doc_{doc_id}"
        title = f"外部資料: {md_path.name}"

        sections[section_id] = Section(section_id=section_id, title=title, parent_id=None)

        excerpt = _normalize_text(raw[:300])
        pages[page_id] = PageMeta(
            page_id=page_id,
            guideline_id="user_docs",
            section_id=section_id,
            title=title,
            summary=excerpt if excerpt else "外部資料の抜粋",
            intent_ids=[],
        )

        # Passage order and source_page both use the 1-based chunk index.
        for ordinal, piece in enumerate(_chunk_text(raw), start=1):
            passage_id = f"{page_id}_p{ordinal:03d}"
            passages[passage_id] = Passage(
                passage_id=passage_id,
                guideline_id="user_docs",
                page_id=page_id,
                section_id=section_id,
                order_in_section=ordinal,
                text=piece,
                source_page=ordinal,
                source_lines=[],
                tags=[md_path.name],
            )

    return sections, pages, passages
|