import os
import sqlite3
import lancedb
import PyPDF2
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector
import warnings
# Suppress warnings
warnings.filterwarnings("ignore")
# CONFIGURATION
DATA_DIR = "data"
DB_NAME = "epstein.db"
VECTOR_DB_DIR = "data/lancedb"
print("Initializing models and databases...")
# 1. Setup SQLite (For Keyword Search)
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()
# Create the main table and an FTS5 (Full Text Search) virtual table
cursor.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename TEXT,
        filepath TEXT,
        page_number INTEGER,
        text_content TEXT
    )
""")
cursor.execute("""
    CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
        filename,
        text_content,
        content='pages',
        content_rowid='id'
    )
""")
# Keep the external-content FTS index in sync on insert. This script only ever
# inserts rows, so matching UPDATE/DELETE triggers are not needed here.
cursor.execute("""
    CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
        INSERT INTO pages_fts(rowid, filename, text_content)
        VALUES (new.id, new.filename, new.text_content);
    END;
""")
conn.commit()
# 2. Setup LanceDB (For Vector/AI Search)
model = SentenceTransformer('all-MiniLM-L6-v2')
ldb = lancedb.connect(VECTOR_DB_DIR)
# Define the table schema with Pydantic so LanceDB knows the vector dimension
class PageSchema(LanceModel):
    vector: Vector(384)  # 384 is the embedding dimension of all-MiniLM-L6-v2
    text: str
    filename: str
    page_number: int
    filepath: str

# Open the table if it already exists; otherwise create it from the schema
try:
    tbl = ldb.open_table("pages")
except Exception:
    tbl = ldb.create_table("pages", schema=PageSchema)
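# Illustrative only (not executed during ingestion): a semantic query against
# the same table via LanceDB's query builder. The query text must be embedded
# with the same model used at index time so the vectors are comparable.
#
#   query_vec = model.encode("description of what you are looking for")
#   for hit in tbl.search(query_vec).limit(5).to_list():
#       print(hit["filename"], hit["page_number"], hit["text"][:80])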
def chunk_text(text, chunk_size=500):
    """Split long page text into smaller chunks for better vector search"""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_length += len(word) + 1  # +1 for the joining space
        current_chunk.append(word)
        if current_length >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
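# Note: chunk_size is a character budget (accumulated word by word), not a
# token count. For example, chunk_text("a b c", chunk_size=3) returns
# ["a b", "c"]: the budget is exceeded after "b", which closes the first chunk.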
def process_pdf(filepath):
    filename = os.path.basename(filepath)
    print(f"Processing: {filename}...")
    try:
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)
            for i in range(num_pages):
                try:
                    page = reader.pages[i]
                    text = page.extract_text()
                    # Junk filter: skip pages with too little text
                    if not text or len(text.strip()) < 50:
                        continue
                    clean_text = text.replace('\x00', '')  # Remove null bytes
                    # A. Insert into SQLite (keyword search)
                    cursor.execute(
                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
                        (filename, filepath, i + 1, clean_text)
                    )
                    # B. Insert into LanceDB (vector search)
                    chunks = chunk_text(clean_text)
                    vectors = model.encode(chunks)
                    data_to_add = []
                    for chunk, vector in zip(chunks, vectors):
                        data_to_add.append({
                            "vector": vector,
                            "text": chunk,
                            "filename": filename,
                            "page_number": i + 1,
                            "filepath": filepath
                        })
                    if data_to_add:
                        tbl.add(data_to_add)
                except Exception as e:
                    print(f"  Error on page {i+1}: {e}")
        # Commit once per file so a crash mid-run loses at most one document
        conn.commit()
    except Exception as e:
        print(f"Failed to read {filename}: {e}")
def main():
    print(f"Scanning directory: {DATA_DIR}")
    if not os.path.isdir(DATA_DIR):
        print(f"Directory '{DATA_DIR}' not found. Nothing to do.")
        return
    pdf_count = 0
    for root, dirs, files in os.walk(DATA_DIR):
        for file in files:
            if file.lower().endswith('.pdf'):
                full_path = os.path.join(root, file)
                process_pdf(full_path)
                pdf_count += 1
    conn.close()
    print(f"Done! Processed {pdf_count} PDF files.")
    print("Run 'python app.py' next to start the server.")

if __name__ == "__main__":
    main()