import os
import sqlite3
import warnings

import lancedb
import PyPDF2
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector

# Suppress noisy library warnings during ingestion.
warnings.filterwarnings("ignore")

# CONFIGURATION
DATA_DIR = "data"
DB_NAME = "epstein.db"
VECTOR_DB_DIR = "data/lancedb"

print("Initializing models and databases...")

# 1. Setup SQLite (For Keyword Search)
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# Main storage table: one row per extracted PDF page.
cursor.execute("""
CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    filename TEXT,
    filepath TEXT,
    page_number INTEGER,
    text_content TEXT
)
""")

# External-content FTS5 index over `pages` (content='pages' means the FTS
# table stores no copy of the text; it must be kept in sync via triggers).
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
    filename,
    text_content,
    content='pages',
    content_rowid='id'
)
""")

# Sync triggers for the external-content FTS index.
# FIX: the original only created the AFTER INSERT trigger, so any later
# DELETE or UPDATE on `pages` would leave stale entries in the FTS index.
# The FTS5 external-content pattern requires all three triggers.
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
    INSERT INTO pages_fts(rowid, filename, text_content)
    VALUES (new.id, new.filename, new.text_content);
END;
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
    INSERT INTO pages_fts(pages_fts, rowid, filename, text_content)
    VALUES ('delete', old.id, old.filename, old.text_content);
END;
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
    INSERT INTO pages_fts(pages_fts, rowid, filename, text_content)
    VALUES ('delete', old.id, old.filename, old.text_content);
    INSERT INTO pages_fts(rowid, filename, text_content)
    VALUES (new.id, new.filename, new.text_content);
END;
""")
conn.commit()
# 2. Setup LanceDB (For Vector/AI Search)
model = SentenceTransformer('all-MiniLM-L6-v2')
ldb = lancedb.connect(VECTOR_DB_DIR)


# --- THE FIX: Use Pydantic to define the Schema ---
class PageSchema(LanceModel):
    vector: Vector(384)  # 384 is the dimension of all-MiniLM-L6-v2
    text: str
    filename: str
    page_number: int
    filepath: str


# Create or Open the table using the Class Schema.
# FIX: was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
try:
    tbl = ldb.open_table("pages")
except Exception:
    tbl = ldb.create_table("pages", schema=PageSchema)
# --------------------------------------------------


def chunk_text(text, chunk_size=500):
    """Split long page text into smaller chunks for better vector search.

    Splits on whitespace and accumulates words until the running length
    (each word plus one separator character) reaches `chunk_size`, then
    starts a new chunk.

    Args:
        text: The raw page text to split.
        chunk_size: Approximate maximum chunk length in characters.

    Returns:
        A list of chunk strings; empty list when `text` has no words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1
        current_chunk.append(word)
        if current_length >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    # Flush the trailing partial chunk, if any.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def process_pdf(filepath):
    """Extract text from every page of one PDF and index it.

    Each page with enough text is inserted into SQLite (keyword search)
    and, chunked and embedded, into LanceDB (vector search). Per-page
    errors are reported and skipped so one bad page does not abort the file.

    Args:
        filepath: Path to the PDF file to ingest.
    """
    filename = os.path.basename(filepath)
    # FIX: the original f-string had lost its placeholder and printed a
    # literal instead of the file being processed.
    print(f"Processing: {filename}...")
    try:
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)
            for i in range(num_pages):
                try:
                    page = reader.pages[i]
                    text = page.extract_text()
                    # Junk Filter: Skip pages with too little text
                    if not text or len(text.strip()) < 50:
                        continue
                    clean_text = text.replace('\x00', '')  # Remove null bytes
                    # A. Insert into SQLite (Keyword Search)
                    cursor.execute(
                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
                        (filename, filepath, i + 1, clean_text)
                    )
                    # B. Insert into LanceDB (Vector Search)
                    chunks = chunk_text(clean_text)
                    vectors = model.encode(chunks)
                    data_to_add = [
                        {
                            "vector": vector,
                            "text": chunk,
                            "filename": filename,
                            "page_number": i + 1,
                            "filepath": filepath,
                        }
                        for chunk, vector in zip(chunks, vectors)
                    ]
                    if data_to_add:
                        tbl.add(data_to_add)
                except Exception as e:
                    print(f" Error on page {i+1}: {e}")
        conn.commit()
    except Exception as e:
        # FIX: the original f-string had lost its placeholder here too.
        print(f"Failed to read {filename}: {e}")


def main():
    """Walk DATA_DIR recursively and ingest every PDF found."""
    print(f"Scanning directory: {DATA_DIR}")
    pdf_count = 0
    # `_dirs` is unused; os.walk still descends into every subdirectory.
    for root, _dirs, files in os.walk(DATA_DIR):
        for file in files:
            if file.lower().endswith('.pdf'):
                full_path = os.path.join(root, file)
                process_pdf(full_path)
                pdf_count += 1
    print(f"Done! Processed {pdf_count} PDF files.")
    print("Run 'python app.py' next to start the server.")


if __name__ == "__main__":
    main()