from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple

import numpy as np
import faiss
from pypdf import PdfReader
import yaml

from openai_client import embed_texts
from guardrails import sanitize

with open("config.yaml", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)
EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)

DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"
INDEX_DIR = DATA_DIR / "index"
META_PATH = INDEX_DIR / "meta.jsonl"      # must match the path read by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
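# Expected shape of config.yaml, as a sketch: the key names are inferred from
# the lookups in this file, but the values shown are illustrative assumptions,
# not the project's actual settings.
#
#   embedding_model: text-embedding-3-small
#   normalize_embeddings: true
#   chunk:
#     target_chars: 800
#     overlap_chars: 200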

def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text per page, returning (1-based page number, cleaned text) pairs."""
    pages: List[Tuple[int, str]] = []
    reader = PdfReader(path)
    for i, p in enumerate(reader.pages):
        txt = p.extract_text() or ""
        # strip leading/trailing whitespace per line and drop blank lines
        txt = "\n".join(line.strip() for line in txt.splitlines() if line.strip())
        pages.append((i + 1, txt))
    return pages

def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Split each page into overlapping windows of roughly target_chars characters."""
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        start = 0
        while start < len(text):
            end = min(len(text), start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:  # skip fragments too short to be useful
                chunks.append({"page": page, "text": chunk})
            if end >= len(text):
                break  # last window; stepping back by the overlap would loop forever
            # advance, but never move backwards (guards against overlap >= target)
            start = end - overlap_chars if end - overlap_chars > start else end
    return chunks
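# Worked example of the windowing above (illustrative numbers, not the
# configured ones): with target_chars=800 and overlap_chars=200, a 2,000-char
# page yields the windows [0:800], [600:1400], [1200:2000]. Each window starts
# overlap_chars before the previous one ended, so text spanning a boundary
# still appears intact in at least one chunk.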

def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Row-normalize so inner product equals cosine similarity (no-op if disabled)."""
    if not NORMALIZE:
        return m
    norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12  # epsilon guards zero rows
    return m / norms

def build_index():
    """Chunk every PDF under data/pdf/, embed the chunks, and write the FAISS index."""
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]

    texts: List[str] = []
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            chunks = split_chunks(pages, target_chars, overlap_chars)
            for c in chunks:
                t = c["text"][:1800]  # hard cap per chunk before embedding
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

    if not texts:
        raise SystemExit("No chunks produced. Put PDFs under data/pdf/ and rerun.")

    vecs = embed_texts(texts, EMB_MODEL)
    mat = np.array(vecs, dtype="float32")
    mat = l2_normalize(mat)

    # cosine similarity: with L2-normalized vectors, inner product == cosine
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")

if __name__ == "__main__":
    build_index()
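
# Minimal query-side sketch of how these artifacts could be consumed (an
# assumption about app.py, which is not shown here; only embed_texts and the
# meta.jsonl layout come from this file):
#
#   index = faiss.read_index(str(INDEX_PATH))
#   metas = [json.loads(line) for line in open(META_PATH, encoding="utf-8")]
#   q = np.array(embed_texts(["my question"], EMB_MODEL), dtype="float32")
#   scores, ids = index.search(l2_normalize(q), 5)   # top-5 by inner product (= cosine)
#   hits = [metas[i] for i in ids[0] if i != -1]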