| |
|
| | import os
|
| | import json
|
| | import math
|
| | from pathlib import Path
|
| | from tqdm import tqdm
|
| |
|
| | import numpy as np
|
| | import pdfplumber
|
| | from sentence_transformers import SentenceTransformer
|
| | import faiss
|
| |
|
| |
|
# Directory scanned for input PDFs and directory that receives the index files.
DOCS_DIR = Path("docs")
DATA_DIR = Path("data")
# Model name handed to SentenceTransformer() in main().
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Default chunk length and overlap (in characters) for split_text_into_chunks().
CHUNK_CHAR_SIZE = 1000
CHUNK_OVERLAP = 200
# NOTE(review): not referenced in the visible code; main() asks the model for
# its embedding dimension instead of using this value.
EMBED_DIM = 384
# Number of texts sent to model.encode() per call in build_embeddings().
BATCH_SIZE = 32
# NOTE(review): not referenced in the visible code -- presumably meant for a
# query/search step that lives elsewhere; confirm before removing.
TOP_K = 5

# Runs at import time: make sure the output directory exists before main()
# writes the index and metadata files into it.
DATA_DIR.mkdir(exist_ok=True)
|
| |
|
def extract_text_from_pdf(pdf_path: Path):
    """Read a PDF with pdfplumber and return its text page by page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one dict per page, in document order, shaped like
        ``{"page_number": <1-based page index>, "text": <extracted text>}``.
        When ``extract_text()`` yields nothing for a page, its text is "".
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            # extract_text() may return a falsy value; store "" in that case.
            {"page_number": number, "text": page.extract_text() or ""}
            for number, page in enumerate(document.pages, start=1)
        ]
|
| |
|
def split_text_into_chunks(text, chunk_size=CHUNK_CHAR_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into overlapping chunks, preferring natural break points.

    The text is walked with a window of ``chunk_size`` characters. When the
    window does not reach the end of the text, the chunk is shortened to end
    just after the last newline or sentence terminator (". ", "? ", "! ")
    found in the window, provided that break lies in the second half of the
    window. Each following window starts ``overlap`` characters before the
    end of the previous chunk.

    Args:
        text: The string to split. Leading/trailing whitespace is ignored.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is outside the ranges
            above. Such values previously made the loop spin forever
            (window never advancing) or silently skipped text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text = text.strip()
    if not text:
        return []

    # A break point is accepted only past the middle of the window AND at or
    # beyond the overlap width; a cut inside the overlap would place the next
    # window at or before the current one and the loop would never finish.
    min_cut = max(int(chunk_size * 0.5), overlap - 1)

    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = start + chunk_size
        if end < text_len:
            snippet = text[start:end]
            # Right-most newline or sentence terminator inside the window
            # (-1 when none is present, which never exceeds min_cut).
            cut = max(snippet.rfind(sep) for sep in ("\n", ". ", "? ", "! "))
            if cut > min_cut:
                # Keep the break character itself in the current chunk.
                end = start + cut + 1
        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append(chunk_text)
        if end >= text_len:
            break
        # Guaranteed to move forward: end - start > overlap in both branches.
        start = end - overlap
    return chunks
|
| |
|
def build_embeddings(model, texts, batch_size=None):
    """Encode texts into one embedding matrix, a batch at a time.

    Args:
        model: Object providing ``encode(batch, show_progress_bar=...,
            convert_to_numpy=...)`` and ``get_sentence_embedding_dimension()``;
            main() passes a SentenceTransformer.
        texts: Sequence of strings to embed (must support slicing).
        batch_size: Number of texts per ``encode`` call. ``None`` (default)
            uses the module-level BATCH_SIZE.

    Returns:
        A 2-D array with one row per input text, built by stacking the
        per-batch results. For empty input the result has shape
        ``(0, dim)`` and dtype float32, so it can flow through the same
        float32 FAISS steps as a non-empty result.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative step
            would otherwise silently produce no embeddings).
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if not texts:
        dim = model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    embeddings = [
        model.encode(
            texts[i:i + batch_size],
            show_progress_bar=False,
            convert_to_numpy=True,
        )
        for i in range(0, len(texts), batch_size)
    ]
    return np.vstack(embeddings)
|
| |
|
def normalize_embeddings(embeddings: np.ndarray):
    """Scale each row of the embedding matrix to unit L2 norm.

    Args:
        embeddings: 2-D array of embeddings, one vector per row.

    Returns:
        The normalized matrix as a C-contiguous float32 array. When the
        input already has that layout and dtype it is normalized in place
        and the same object is returned; otherwise a converted copy is
        normalized and returned, leaving the input untouched.
    """
    # faiss.normalize_L2 writes into the buffer it is given and expects
    # contiguous float32 data; coerce first so float64 or sliced input is
    # handled instead of failing inside FAISS. No copy is made when the
    # array already conforms.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)
    return embeddings
|
| |
|
def main():
    """Build a FAISS index and a metadata file from the PDFs in DOCS_DIR.

    Pipeline:
      1. Collect ``*.pdf`` files from DOCS_DIR (stops with a message if none).
      2. Extract text per page and split every page into chunks.
      3. Embed all chunks, L2-normalize them and add them to an
         inner-product index (inner product of unit vectors).
      4. Write the index to ``DATA_DIR / "vector_store.index"`` and the
         per-chunk metadata to ``DATA_DIR / "metadata.json"``.

    Row ``n`` of the index corresponds to the metadata entry whose
    ``chunk_id`` is ``n``.
    """
    # Sorted so chunk ids are assigned in the same order on every run;
    # the order returned by glob() depends on the filesystem.
    pdf_files = sorted(DOCS_DIR.glob("*.pdf"))
    if not pdf_files:
        print("No PDF files found in docs/ — put your PDFs there and re-run.")
        return

    # Load the model only once we know there is something to embed.
    model = SentenceTransformer(EMBED_MODEL)
    embed_dim = model.get_sentence_embedding_dimension()
    print(f"Loaded embed model '{EMBED_MODEL}' with dim={embed_dim}")

    all_text_chunks = []
    metadata = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path.name}")
        for page in extract_text_from_pdf(pdf_path):
            page_text = page["text"]
            if not page_text:
                # Page without extractable text: nothing to index.
                continue
            for i, c in enumerate(split_text_into_chunks(page_text)):
                metadata.append({
                    # Running counter; equals this chunk's row in the index.
                    "chunk_id": len(all_text_chunks),
                    "source_file": pdf_path.name,
                    "page": page["page_number"],
                    "chunk_index_in_page": i,
                    # Stored text is capped at 1000 characters.
                    "text": c[:1000],
                })
                all_text_chunks.append(c)

    if not all_text_chunks:
        print("No text extracted from PDFs.")
        return

    print(f"Total chunks: {len(all_text_chunks)}")

    embeddings = build_embeddings(model, all_text_chunks)
    print("Embeddings shape:", embeddings.shape)

    # Convert to contiguous float32 BEFORE normalizing: the FAISS routines
    # operate on float32 buffers, so casting afterwards would be too late.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    embeddings = normalize_embeddings(embeddings)

    index = faiss.IndexFlatIP(embed_dim)
    index.add(embeddings)
    print("FAISS index built. n_total:", index.ntotal)

    index_path = DATA_DIR / "vector_store.index"
    faiss.write_index(index, str(index_path))
    meta_path = DATA_DIR / "metadata.json"
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)

    print(f"Saved FAISS index -> {index_path}")
    print(f"Saved metadata -> {meta_path}")
|
| |
|
# Run the indexing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
| |
|