from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple

import numpy as np
import faiss
from pypdf import PdfReader
import yaml

from openai_client import embed_texts
from guardrails import sanitize

with open("config.yaml", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)
EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)

DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"
INDEX_DIR = DATA_DIR / "index"
META_PATH = INDEX_DIR / "meta.jsonl"      # must match the path read by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
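# Expected shape of config.yaml, as a sketch: the key names are inferred from
# the lookups in this file, but the values shown are illustrative assumptions,
# not the project's actual settings.
#
#   embedding_model: text-embedding-3-small
#   normalize_embeddings: true
#   chunk:
#     target_chars: 800
#     overlap_chars: 200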

def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text per page, returning (1-based page number, cleaned text) pairs."""
    pages: List[Tuple[int, str]] = []
    reader = PdfReader(path)
    for i, p in enumerate(reader.pages):
        txt = p.extract_text() or ""
        # strip leading/trailing whitespace per line and drop blank lines
        txt = "\n".join(line.strip() for line in txt.splitlines() if line.strip())
        pages.append((i + 1, txt))
    return pages

def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Split each page into overlapping windows of roughly target_chars characters."""
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        start = 0
        while start < len(text):
            end = min(len(text), start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:  # skip fragments too short to be useful
                chunks.append({"page": page, "text": chunk})
            if end >= len(text):
                break  # last window; stepping back by the overlap would loop forever
            # advance, but never move backwards (guards against overlap >= target)
            start = end - overlap_chars if end - overlap_chars > start else end
    return chunks
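# Worked example of the windowing above (illustrative numbers, not the
# configured ones): with target_chars=800 and overlap_chars=200, a 2,000-char
# page yields the windows [0:800], [600:1400], [1200:2000]. Each window starts
# overlap_chars before the previous one ended, so text spanning a boundary
# still appears intact in at least one chunk.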

def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Row-normalize so inner product equals cosine similarity (no-op if disabled)."""
    if not NORMALIZE:
        return m
    norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12  # epsilon guards zero rows
    return m / norms

def build_index():
    """Chunk every PDF under data/pdf/, embed the chunks, and write the FAISS index."""
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]

    texts: List[str] = []
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            chunks = split_chunks(pages, target_chars, overlap_chars)
            for c in chunks:
                t = c["text"][:1800]  # hard cap per chunk before embedding
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

    if not texts:
        raise SystemExit("No chunks produced. Put PDFs under data/pdf/ and rerun.")

    vecs = embed_texts(texts, EMB_MODEL)
    mat = np.array(vecs, dtype="float32")
    mat = l2_normalize(mat)

    # cosine similarity: with L2-normalized vectors, inner product == cosine
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")

if __name__ == "__main__":
    build_index()
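
# Minimal query-side sketch of how these artifacts could be consumed (an
# assumption about app.py, which is not shown here; only embed_texts and the
# meta.jsonl layout come from this file):
#
#   index = faiss.read_index(str(INDEX_PATH))
#   metas = [json.loads(line) for line in open(META_PATH, encoding="utf-8")]
#   q = np.array(embed_texts(["my question"], EMB_MODEL), dtype="float32")
#   scores, ids = index.search(l2_normalize(q), 5)   # top-5 by inner product (= cosine)
#   hits = [metas[i] for i in ids[0] if i != -1]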