# Source: Hugging Face file view (raw / history / blame); uploaded by GirishaBuilds01.
# Commit 1ffe118 (verified): "Upload 8 files". File size: 816 bytes.
from pathlib import Path
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import faiss, pickle
# Directory that ingest() scans for "*.pdf" files.
PDF_DIR = Path("data/pdfs")
# Directory where ingest() writes "esg.faiss" and "docs.pkl".
INDEX_DIR = Path("index")
# Embedding model used by ingest() to encode page text; it is constructed
# once, when this module is imported.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def ingest():
    """Build a FAISS index from the text of every PDF in ``PDF_DIR``.

    Each page with non-empty extracted text becomes one document. The
    documents are embedded with the module-level ``model``, added to a flat
    L2 index, and persisted under ``INDEX_DIR`` as ``esg.faiss`` (the index)
    and ``docs.pkl`` (the pickled list of page texts, in index order).

    Returns:
        None. If no page yields any text, nothing is written.
    """
    texts = []
    # Sort so that document order (and thus index ids) is reproducible;
    # glob() order depends on the filesystem.
    for pdf in sorted(PDF_DIR.glob("*.pdf")):
        reader = PdfReader(pdf)
        for page in reader.pages:
            page_text = page.extract_text()
            # Skip pages for which no text could be extracted.
            if page_text:
                texts.append(page_text)

    if not texts:
        return

    emb = model.encode(texts, show_progress_bar=False)
    # Index dimensionality is taken from the embedding width.
    index = faiss.IndexFlatL2(emb.shape[1])
    index.add(emb)

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_DIR / "esg.faiss"))
    # Use a context manager so the file is flushed and closed deterministically.
    with open(INDEX_DIR / "docs.pkl", "wb") as fh:
        pickle.dump(texts, fh)
# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    ingest()