Spaces:
Sleeping
Sleeping
| import os | |
| import sqlite3 | |
| import lancedb | |
| import PyPDF2 | |
| from sentence_transformers import SentenceTransformer | |
| from lancedb.pydantic import LanceModel, Vector | |
| import warnings | |
| # Suppress warnings | |
| warnings.filterwarnings("ignore") | |
# CONFIGURATION
DATA_DIR = "data"                # directory tree scanned for PDF files
DB_NAME = "epstein.db"           # SQLite database file (keyword search)
VECTOR_DB_DIR = "data/lancedb"   # LanceDB directory (vector search)

print("Initializing models and databases...")

# 1. Setup SQLite (For Keyword Search)
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# Main table: one row per extracted PDF page.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename TEXT,
        filepath TEXT,
        page_number INTEGER,
        text_content TEXT
    )
""")

# FTS5 index declared as an *external content* table: it stores only the
# full-text index and reads column values back from `pages` via rowid = id.
cursor.execute("""
    CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
        filename,
        text_content,
        content='pages',
        content_rowid='id'
    )
""")

# An external-content FTS5 table is NOT updated automatically; the triggers
# below keep it in step with `pages` for inserts, deletes and updates.
cursor.execute("""
    CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
        INSERT INTO pages_fts(rowid, filename, text_content)
        VALUES (new.id, new.filename, new.text_content);
    END;
""")

# Without a delete trigger, removed pages would keep matching searches.
# The special 'delete' command must be given the values that were indexed.
cursor.execute("""
    CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
        INSERT INTO pages_fts(pages_fts, rowid, filename, text_content)
        VALUES ('delete', old.id, old.filename, old.text_content);
    END;
""")

# An update is a delete of the old index entry followed by a fresh insert.
cursor.execute("""
    CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
        INSERT INTO pages_fts(pages_fts, rowid, filename, text_content)
        VALUES ('delete', old.id, old.filename, old.text_content);
        INSERT INTO pages_fts(rowid, filename, text_content)
        VALUES (new.id, new.filename, new.text_content);
    END;
""")

conn.commit()
# 2. Setup LanceDB (For Vector/AI Search)
model = SentenceTransformer('all-MiniLM-L6-v2')
ldb = lancedb.connect(VECTOR_DB_DIR)


# Row schema for the LanceDB "pages" table, declared as a Pydantic model.
# Each row is one embedded text chunk plus the metadata locating its page.
class PageSchema(LanceModel):
    vector: Vector(384)  # 384 is the dimension of all-MiniLM-L6-v2
    text: str            # the chunk text that was embedded
    filename: str        # base name of the source PDF
    page_number: int     # 1-based page number within the PDF
    filepath: str        # path of the source PDF as it was scanned


# Open the table if it already exists, otherwise create it from the schema.
try:
    tbl = ldb.open_table("pages")
except Exception:
    # Any failure to open is treated as "table does not exist yet".
    # `Exception` (not a bare `except`) so that Ctrl-C and SystemExit
    # still propagate instead of being swallowed here.
    tbl = ldb.create_table("pages", schema=PageSchema)
def chunk_text(text, chunk_size=500):
    """Break page text into word-aligned chunks for vector search.

    The text is split on whitespace and words are accumulated until the
    running size (each word's length plus one for its separator) reaches
    ``chunk_size``; the accumulated words are then emitted as one
    space-joined chunk. ``chunk_size`` is therefore a soft threshold: words
    are never cut, so a chunk may exceed it by up to one word.

    Args:
        text: The text to split.
        chunk_size: Approximate chunk size in characters.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.
    """
    pieces = []
    pending = []
    running = 0

    for token in text.split():
        pending.append(token)
        running += len(token) + 1
        if running < chunk_size:
            continue
        # Threshold reached: flush the accumulated words as one chunk.
        pieces.append(" ".join(pending))
        pending, running = [], 0

    # Whatever is left over forms the final, shorter chunk.
    if pending:
        pieces.append(" ".join(pending))

    return pieces
def process_pdf(filepath):
    """Index every text-bearing page of one PDF for keyword and vector search.

    For each page with at least 50 characters of usable text, the page is
    inserted into the SQLite ``pages`` table, then split into chunks,
    embedded, and added to the LanceDB table. SQLite changes are committed
    once, after all pages of the file have been attempted.

    Failures are reported with ``print`` rather than raised: an error on one
    page skips that page only, and an error opening or reading the file
    skips the whole file. Because the SQLite insert runs before the
    embedding step, a page whose embedding fails can still end up in the
    keyword index.

    Args:
        filepath: Path to the PDF file.
    """
    filename = os.path.basename(filepath)
    print(f"Processing: {filename}...")

    try:
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)

            # Pages are fetched by index *inside* the per-page try block so
            # that one unreadable page cannot abort the rest of the file.
            for i in range(num_pages):
                page_number = i + 1
                try:
                    text = reader.pages[i].extract_text() or ""

                    # Remove null bytes BEFORE measuring the page, so that
                    # NUL padding cannot carry a junk page past the filter.
                    clean_text = text.replace('\x00', '')

                    # Junk Filter: Skip pages with too little text
                    if len(clean_text.strip()) < 50:
                        continue

                    # A. Insert into SQLite (Keyword Search)
                    cursor.execute(
                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
                        (filename, filepath, page_number, clean_text)
                    )

                    # B. Insert into LanceDB (Vector Search)
                    chunks = chunk_text(clean_text)
                    if not chunks:
                        continue
                    vectors = model.encode(chunks)
                    data_to_add = [
                        {
                            "vector": vector,
                            "text": chunk,
                            "filename": filename,
                            "page_number": page_number,
                            "filepath": filepath,
                        }
                        for chunk, vector in zip(chunks, vectors)
                    ]
                    if data_to_add:
                        tbl.add(data_to_add)

                except Exception as e:
                    # Best-effort: report and move on to the next page.
                    print(f" Error on page {page_number}: {e}")

            conn.commit()

    except Exception as e:
        print(f"Failed to read {filename}: {e}")
def main():
    """Walk DATA_DIR recursively and index every PDF file found.

    Files are matched by a case-insensitive ``.pdf`` suffix and handed to
    ``process_pdf`` one at a time as the walk proceeds. A summary with the
    number of PDFs handled is printed at the end.
    """
    print(f"Scanning directory: {DATA_DIR}")

    processed = 0
    for folder, _subdirs, names in os.walk(DATA_DIR):
        for name in names:
            # Anything that is not a PDF is ignored.
            if not name.lower().endswith('.pdf'):
                continue
            process_pdf(os.path.join(folder, name))
            processed += 1

    print(f"Done! Processed {processed} PDF files.")
    print("Run 'python app.py' next to start the server.")


if __name__ == "__main__":
    main()