# IR_ESG_RAG_Bot / ingest.py
# (Hugging Face upload metadata: Corin1998, "Upload 8 files", commit f31c318 verified)
from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple
import numpy as np
import faiss
from pypdf import PdfReader
import yaml
from openai_client import embed_texts
from guardrails import sanitize
# Configuration and filesystem layout.
# Use a context manager so the config file handle is closed deterministically
# (the original `yaml.safe_load(open(...))` leaked the handle).
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CFG = yaml.safe_load(_cfg_file)
EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)  # L2-normalize embeddings before indexing
DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"            # input PDFs are read from here
INDEX_DIR = DATA_DIR / "index"        # index artifacts are written here
META_PATH = INDEX_DIR / "meta.jsonl"  # must match the path used by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text from a PDF, returning (1-based page number, text) pairs.

    Each page's text has every line stripped of surrounding whitespace and
    blank lines removed; pages with no extractable text yield an empty string.
    """
    result: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text() or ""
        stripped = (line.strip() for line in raw.splitlines())
        cleaned = "\n".join(line for line in stripped if line)
        result.append((page_no, cleaned))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Split per-page texts into overlapping character-window chunks.

    Args:
        pages: (page_number, text) pairs as produced by read_pdf_with_pages.
        target_chars: desired chunk length in characters.
        overlap_chars: characters shared between consecutive chunks; clamped
            below target_chars so the window always moves forward.

    Returns:
        List of dicts with keys "page" and "text". Chunks whose stripped text
        is shorter than 50 characters are dropped as noise.

    Bug fixed: the original update `start = end - overlap_chars` never
    terminated once `end` was pinned at len(text) with a positive remainder —
    e.g. a 120-char page with target 100 / overlap 20 looped forever at
    start=100. We now break as soon as the window reaches the end of the text.
    """
    # An overlap >= target would make the window stall or move backwards.
    safe_overlap = min(overlap_chars, max(target_chars - 1, 0))
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        n = len(text)
        start = 0
        while start < n:
            end = min(n, start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:
                chunks.append({"page": page, "text": chunk})
            if end == n:
                # Last window of this page — stop instead of re-sliding
                # back over the tail forever.
                break
            start = end - safe_overlap if end - safe_overlap > 0 else end
    return chunks
def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalization, gated by the NORMALIZE config flag.

    Returns the input unchanged when normalization is disabled.
    """
    if NORMALIZE:
        # Epsilon keeps all-zero rows from dividing by zero.
        return m / (np.linalg.norm(m, axis=1, keepdims=True) + 1e-12)
    return m
def build_index():
    """Build a FAISS inner-product index over chunked PDF text.

    Reads every PDF under data/pdf/, chunks the extracted text per the
    config, embeds the chunks, and writes:
      - data/index/meta.jsonl : one JSON record per chunk (source, page, text)
      - data/index/faiss.index: the FAISS index (inner product == cosine
        similarity when the embedding rows are L2-normalized)

    Raises:
        SystemExit: when no chunk text could be collected (no PDFs found).
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]
    texts: List[str] = []
    # Context manager guarantees meta.jsonl is closed even if a PDF fails
    # mid-loop (the original left the handle open on any exception).
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            for c in split_chunks(pages, target_chars, overlap_chars):
                t = c["text"][:1800]  # hard cap per chunk to bound embedding input size
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")
    vecs = embed_texts(texts, EMB_MODEL)
    mat = l2_normalize(np.array(vecs, dtype="float32"))
    # Cosine similarity via inner product over (optionally) normalized vectors.
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
# Script entry point: build (or rebuild) the index with `python ingest.py`.
if __name__ == "__main__":
    build_index()