# ESG_IR_RAGbot / ingest.py
# Builds a FAISS vector index from PDFs under data/pdf/.
# (Uploaded by Corin1998, revision 81a064e.)
from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple
import numpy as np
import faiss
from pypdf import PdfReader
import yaml
from openai_client import embed_texts
from guardrails import sanitize
# Absolute paths anchored at this file's directory, so the script works
# regardless of the current working directory.
BASE_DIR = pathlib.Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
PDF_DIR = DATA_DIR / "pdf"              # input PDFs are read from here
INDEX_DIR = DATA_DIR / "index"          # index artifacts are written here
META_PATH = INDEX_DIR / "meta.jsonl"    # one JSON object per chunk (source, page, text)
INDEX_PATH = INDEX_DIR / "faiss.index"  # serialized FAISS index

# Load config once at import time. read_text() closes the file; the original
# bare open() leaked the file handle.
CFG = yaml.safe_load((BASE_DIR / "config.yaml").read_text(encoding="utf-8"))
EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)
target_chars = CFG["chunk"]["target_chars"]
overlap_chars = CFG["chunk"]["overlap_chars"]
def read_pdf_with_pages(path: str):
    """Extract text from each page of a PDF.

    Returns a list of (page_number, text) tuples with 1-based page numbers.
    Blank lines and surrounding whitespace are stripped from each page's text;
    pages that yield no extractable text produce an empty string.
    """
    reader = PdfReader(path)
    result = []
    for page_no, page in enumerate(reader.pages, start=1):
        raw = page.extract_text() or ""
        cleaned = "\n".join(
            stripped for line in raw.splitlines() if (stripped := line.strip())
        )
        result.append((page_no, cleaned))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int):
    """Split page texts into overlapping character windows.

    Args:
        pages: (page_number, text) tuples, as produced by read_pdf_with_pages.
        target_chars: maximum window size in characters.
        overlap_chars: characters shared between consecutive windows.

    Returns:
        List of {"page": int, "text": str} dicts. Windows whose stripped text
        is shorter than 50 characters are dropped as noise.

    Bug fix: the original recomputed ``start = end - overlap_chars`` even after
    the window reached the end of the text, so with a positive overlap the last
    window repeated forever (infinite loop). We now stop once the window hits
    the end, and always force forward progress even if overlap >= target.
    """
    chunks = []
    for page, text in pages:
        if not text:
            continue
        text_len = len(text)
        start = 0
        while start < text_len:
            end = min(text_len, start + target_chars)
            piece = text[start:end]
            if len(piece.strip()) >= 50:  # skip near-empty fragments
                chunks.append({"page": page, "text": piece})
            if end >= text_len:
                break  # final window emitted; do not rewind into it again
            next_start = end - overlap_chars
            # Guarantee progress: never move backwards or stand still.
            start = next_start if next_start > start else end
    return chunks
def l2_normalize(m: np.ndarray) -> np.ndarray:
if not NORMALIZE:
return m
norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12
return m / norms
def build_index() -> None:
    """Chunk every PDF under data/pdf/, embed the chunks, and persist both a
    FAISS index (INDEX_PATH) and a JSONL metadata sidecar (META_PATH).

    Raises:
        SystemExit: when no chunks were produced (no PDFs, or none yielded text).
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    texts: List[str] = []
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            chunks = split_chunks(pages, target_chars, overlap_chars)
            for c in chunks:
                t = c["text"][:1800]  # hard cap per chunk before embedding
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
    # Check the in-memory chunk list rather than META_PATH's file size: stat()
    # on a file with pending buffered writes can report 0 even when data exists.
    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")
    # Embed all chunks and build a cosine-similarity index
    # (inner product over L2-normalized vectors).
    vecs = embed_texts(texts, EMB_MODEL)
    mat = l2_normalize(np.array(vecs, dtype="float32"))
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
# Script entry point: rebuild the index from scratch when run directly.
if __name__ == "__main__":
    build_index()