File size: 2,268 Bytes
f27bb68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from __future__ import annotations

import re
from pathlib import Path
from typing import Dict, Tuple

from .config import DATA_DIR
from .models import PageMeta, Passage, Section


def _normalize_text(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()


def _chunk_text(text: str, chunk_size: int = 700, overlap: int = 120) -> list[str]:
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    The text is whitespace-normalized first. Consecutive chunks share
    *overlap* trailing/leading characters so content broken at a boundary
    remains retrievable from either side.

    Args:
        text: Raw text to split; whitespace runs are collapsed first.
        chunk_size: Maximum length of each chunk. Must be positive.
        overlap: Characters shared between consecutive chunks. Must be
            smaller than chunk_size.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size — either
            would make the loop below fail to advance (infinite loop in
            the original implementation).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # With overlap >= chunk_size, start = end - overlap never moves
        # past the previous start, so the while loop would never terminate.
        raise ValueError("overlap must be smaller than chunk_size")
    text = _normalize_text(text)
    if not text:
        return []
    chunks: list[str] = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            break
        # Guard above guarantees end - overlap > start, so this advances.
        start = max(end - overlap, 0)
    return chunks


def load_user_docs(data_dir: Path | None = None) -> Tuple[Dict[str, Section], Dict[str, PageMeta], Dict[str, Passage]]:
    """Index user-supplied markdown documents into section/page/passage maps.

    Scans ``data_dir`` (default: ``DATA_DIR / "docs"``) for ``*.md`` files in
    sorted order. Each file yields one Section, one PageMeta (summary from the
    first 300 characters), and one Passage per text chunk.

    Returns:
        A ``(sections, pages, passages)`` tuple of dicts keyed by their ids.
        All three are empty when the directory does not exist.
    """
    root = data_dir if data_dir is not None else DATA_DIR / "docs"
    sections: Dict[str, Section] = {}
    pages: Dict[str, PageMeta] = {}
    passages: Dict[str, Passage] = {}

    if not root.exists():
        return sections, pages, passages

    for md_path in sorted(root.glob("*.md")):
        content = md_path.read_text(encoding="utf-8", errors="ignore")
        doc_id = md_path.stem.lower().replace(" ", "_")
        section_id = f"doc::{doc_id}"
        page_id = f"doc_{doc_id}"
        title = f"外部資料: {md_path.name}"

        sections[section_id] = Section(section_id=section_id, title=title, parent_id=None)

        excerpt = _normalize_text(content[:300])
        pages[page_id] = PageMeta(
            page_id=page_id,
            guideline_id="user_docs",
            section_id=section_id,
            title=title,
            summary=excerpt if excerpt else "外部資料の抜粋",
            intent_ids=[],
        )

        for position, piece in enumerate(_chunk_text(content), start=1):
            passage_id = f"{page_id}_p{position:03d}"
            passages[passage_id] = Passage(
                passage_id=passage_id,
                guideline_id="user_docs",
                page_id=page_id,
                section_id=section_id,
                order_in_section=position,
                text=piece,
                source_page=position,
                source_lines=[],
                tags=[md_path.name],
            )

    return sections, pages, passages