"""FastAPI search application over an extracted document archive.

At import time this module downloads ``data.zip`` from a Hugging Face
dataset, extracts it (once), initialises the SQLite schema, loads two
SentenceTransformer models and opens the LanceDB tables.  It then exposes
HTML routes for keyword / concept / visual search and a small JSON/PNG API.
"""

import io
import os
import sqlite3
import zipfile
from contextlib import closing

import fitz  # PyMuPDF
import lancedb
import numpy as np
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, Response
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from huggingface_hub import hf_hub_download
from PIL import Image, ImageDraw, ImageFont
from sentence_transformers import SentenceTransformer

app = FastAPI()

# --- CONFIGURATION & UNZIPPING ---
print("📥 Downloading Data from Hugging Face Dataset...")

# 1. Download the ZIP file
zip_path = hf_hub_download(
    repo_id="AKMESSI/epstein-data",
    filename="data.zip",
    repo_type="dataset",
)

# 2. Extract it (if not already extracted)
DATA_DIR = "data"
if not os.path.exists(DATA_DIR):
    print("📦 Extracting data.zip... (This takes a moment)")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(".")  # Extracts to current folder
    print("✅ Extraction Complete!")
else:
    print("✅ Data already extracted.")

# 3. Set DB Paths
# The zip contains "data/", so we look inside it
DB_NAME = "epstein.db"  # This should ideally be uploaded separately if it's not in the zip
# If your DB is inside the data folder, update this path:
# DB_NAME = os.path.join(DATA_DIR, "epstein.db")
VECTOR_DB_DIR = os.path.join(DATA_DIR, "lancedb")


# --- DATABASE INITIALIZATION ---
def init_db():
    """Create the SQLite tables, FTS5 index and insert trigger if missing.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless.  The connection is closed even when a statement fails.
    """
    with closing(sqlite3.connect(DB_NAME)) as conn:
        cursor = conn.cursor()

        # 1. Main Pages
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT,
                filepath TEXT,
                page_number INTEGER,
                text_content TEXT
            )
        """)

        # 2. FTS Virtual Table (external-content index over `pages`)
        cursor.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
                filename,
                text_content,
                content='pages',
                content_rowid='id'
            )
        """)

        # 3. Triggers: mirror every inserted page into the FTS index
        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
                INSERT INTO pages_fts(rowid, filename, text_content)
                VALUES (new.id, new.filename, new.text_content);
            END;
        """)

        # 4. Analytics
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS search_analytics (
                term TEXT PRIMARY KEY,
                count INTEGER DEFAULT 1,
                last_searched TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        conn.commit()


init_db()


# --- CONNECT TO DB HELPERS ---
def get_db_connection():
    """Return a new SQLite connection whose rows are ``sqlite3.Row`` objects.

    The caller owns the connection and is responsible for closing it.
    """
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn


# --- LOAD AI MODELS ---
print("Loading Text AI Model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Loading Visual AI Model (CLIP)...")
visual_model = SentenceTransformer('clip-ViT-B-32')

# Connect to LanceDB
ldb = lancedb.connect(VECTOR_DB_DIR)


def _open_table(name):
    """Open LanceDB table *name*, returning ``None`` if it cannot be opened."""
    try:
        return ldb.open_table(name)
    except Exception as e:
        # A missing table only disables the matching search mode, so log
        # the reason instead of failing start-up.
        print(f"LanceDB table '{name}' unavailable: {e}")
        return None


# Open Tables
tbl = _open_table("pages")  # Text Vectors
visual_tbl = _open_table("visuals")  # Visual Vectors

# --- TEMPLATES ---
templates = Jinja2Templates(directory="templates")
app.mount("/files", StaticFiles(directory=DATA_DIR), name="files")


# --- INTERNAL HELPERS ---
_FTS_SQL = """
    SELECT p.filename, p.page_number,
           snippet(pages_fts, 1, '', '', '...', 20) as snippet
    FROM pages_fts
    JOIN pages p ON pages_fts.rowid = p.id
    WHERE pages_fts MATCH ?
    ORDER BY rank
    LIMIT 20
"""


def _find_file(filename):
    """Return the path of the first file called *filename* under DATA_DIR.

    Returns ``None`` when no directory below DATA_DIR contains that name.
    """
    for root, _dirs, files in os.walk(DATA_DIR):
        if filename in files:
            return os.path.join(root, filename)
    return None


def _quote_fts_terms(q):
    """Quote each whitespace-separated term of *q* as an FTS5 string.

    Embedded double quotes are doubled, so the result contains no FTS5
    operators.  Returns an empty string when *q* has no terms.
    """
    return " ".join('"' + term.replace('"', '""') + '"' for term in q.split())


def _keyword_search(q):
    """Run the FTS5 keyword query for *q* and return the matching rows.

    *q* is first tried verbatim so FTS5 operators keep working.  If SQLite
    rejects it (typically an FTS5 syntax error from characters such as
    ``-`` or ``"``), the query is retried with every term quoted.
    """
    with closing(get_db_connection()) as conn:
        try:
            return conn.execute(_FTS_SQL, (q,)).fetchall()
        except sqlite3.OperationalError:
            quoted = _quote_fts_terms(q)
            if not quoted:
                return []
            return conn.execute(_FTS_SQL, (quoted,)).fetchall()


def _record_search(q):
    """Best-effort upsert of the normalised search term into the analytics table."""
    try:
        with closing(get_db_connection()) as conn:
            conn.execute("""
                INSERT INTO search_analytics (term, count, last_searched)
                VALUES (?, 1, CURRENT_TIMESTAMP)
                ON CONFLICT(term) DO UPDATE SET
                    count = count + 1,
                    last_searched = CURRENT_TIMESTAMP
            """, (q.lower().strip(),))
            conn.commit()
    except sqlite3.Error as e:
        # Analytics must never break a search; just report the failure.
        print(f"Analytics error: {e}")


# --- ROUTES ---
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the landing page with the five most-searched terms."""
    try:
        with closing(get_db_connection()) as conn:
            trends = conn.execute(
                "SELECT term, count FROM search_analytics ORDER BY count DESC LIMIT 5"
            ).fetchall()
    except sqlite3.Error as e:
        print(f"Trends query error: {e}")
        trends = []
    return templates.TemplateResponse("index.html", {"request": request, "trends": trends})


@app.get("/search", response_class=HTMLResponse)
async def search(request: Request, q: str, searchmode: str = "text"):
    """Render the results partial for query *q*.

    ``searchmode == "visual"`` (when the visual table is available) returns
    up to 20 de-duplicated CLIP matches.  Otherwise the FTS5 keyword search
    runs, followed by the LanceDB concept search when ``searchmode`` is
    ``"text"`` and the text table is available.  An empty *q* yields an
    empty response.
    """
    if not q:
        return ""

    # 1. ANALYTICS (Keep existing)
    _record_search(q)

    results = []
    seen_files = set()

    # --- MODE 1: VISUAL SEARCH (Standard & Reliable) ---
    # `is not None` rather than truthiness: truthiness on the table goes
    # through its length, which costs a row count and treats an empty table
    # as if it were missing.
    if searchmode == "visual" and visual_tbl is not None:
        try:
            # --- DEBUGGING: Check if DB is empty ---
            print(f"🔍 Visual Index Size: {len(visual_tbl)} rows")

            # Simple, standard encoding (No negative math)
            # We just add "photo of" to help CLIP focus
            query_vec = visual_model.encode(f"a photo of {q}")

            # Get 50 results to ensure variety
            vec_results = visual_tbl.search(query_vec).limit(50).to_list()

            for res in vec_results:
                # Deduplication: Don't show the same file 10 times
                uid = f"{res['filename']}-{res['page']}"
                if uid in seen_files:
                    continue
                seen_files.add(uid)
                results.append({
                    "type": "Visual Match",
                    "filename": res['filename'],
                    "page": res['page'],
                    "text": f"Visual match for '{q}'",
                    "score": 1.0 - res['_distance'],
                })

            # Keep top 20 unique results
            results = results[:20]
        except Exception as e:
            print(f"Visual search error: {e}")

        return templates.TemplateResponse(
            "partials/results.html", {"request": request, "results": results}
        )

    # --- MODE 2: TEXT SEARCH (Standard) ---
    # A. SQLite Keyword Search
    try:
        for row in _keyword_search(q):
            uid = f"{row['filename']}-{row['page_number']}"
            if uid in seen_files:
                continue
            results.append({
                "type": "Exact Match",
                "filename": row['filename'],
                "page": row['page_number'],
                "text": row['snippet'],
                "score": 1.0,
            })
            seen_files.add(uid)
    except Exception as e:
        print(f"Text SQL Error: {e}")

    # B. LanceDB Text Concept Search
    if tbl is not None and searchmode == "text":
        try:
            vector_query = text_model.encode(q)
            vec_results = tbl.search(vector_query).limit(20).to_list()

            for res in vec_results:
                unique_id = f"{res['filename']}-{res['page_number']}"
                if unique_id in seen_files:
                    continue
                results.append({
                    "type": "Concept Match",
                    "filename": res['filename'],
                    "page": res['page_number'],
                    "text": res['text'][:200] + "...",
                    "score": 1.0 - res['_distance'],
                })
                seen_files.add(unique_id)
        except Exception as e:
            # Concept search is an optional extra; keep keyword results.
            print(f"Concept search error: {e}")

    return templates.TemplateResponse(
        "partials/results.html", {"request": request, "results": results}
    )


@app.get("/view/{filename}", response_class=HTMLResponse)
async def view_document(request: Request, filename: str, page: int = 1):
    """Render the viewer for *filename*, or raise 404 if it is not under DATA_DIR."""
    found = _find_file(filename)
    if not found:
        raise HTTPException(status_code=404, detail="File not found")

    # Translate the on-disk path into a URL under the "/files" static mount.
    rel_path = os.path.relpath(found, DATA_DIR)
    filepath = f"/files/{rel_path.replace(os.sep, '/')}"

    return templates.TemplateResponse(
        "viewer.html",
        {"request": request, "filename": filename, "filepath": filepath, "page": page},
    )


# --- API ENDPOINTS ---
@app.get("/api/snap/{filename}/{page}")
async def snap_evidence(filename: str, page: int):
    """Return a PNG of page *page* (1-based) of *filename* with a footer banner.

    Raises:
        HTTPException: 404 if the file is not found or *page* is outside the
            document; 500 if rendering fails for any other reason.
    """
    # Find file
    filepath = _find_file(filename)
    if not filepath:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        # Render
        with fitz.open(filepath) as doc:
            # Validate explicitly: page=0 would otherwise become index -1,
            # which load_page resolves to the LAST page.
            if not 1 <= page <= len(doc):
                raise HTTPException(status_code=404, detail="Page not found")
            pdf_page = doc.load_page(page - 1)
            pix = pdf_page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Watermark
        draw = ImageDraw.Draw(img)
        width, height = img.size
        footer_h = 60
        draw.rectangle([(0, height - footer_h), (width, height)], fill="#000000")

        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except OSError:
            # Font file not available on this host; use Pillow's built-in one.
            font = ImageFont.load_default()

        text = f"EVIDENCE: {filename} | PG {page} | SOURCE: EPSTEIN ARCHIVE"
        draw.text((20, height - 40), text, fill="white", font=font)

        # Return
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return Response(content=buffer.getvalue(), media_type="image/png")
    except HTTPException:
        # Let deliberate HTTP errors (the 404 above) through untouched.
        raise
    except Exception as e:
        print(f"Snap error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/api/similar/{filename}/{page}")
async def similar_evidence(filename: str, page: int):
    """Return up to six pages whose text vectors are nearest to the given page.

    The queried page itself is excluded.  Returns an empty list when the
    text table is unavailable, the page is not indexed, or the lookup fails.
    """
    if tbl is None:
        return []

    try:
        # The filter is a SQL string, so double any single quotes in the
        # user-supplied filename to keep it inside the string literal.
        safe_name = filename.replace("'", "''")
        current_page = (
            tbl.search()
            .where(f"filename = '{safe_name}' AND page_number = {int(page)}")
            .limit(1)
            .to_list()
        )
        if not current_page:
            return []

        vector = current_page[0]['vector']
        neighbours = tbl.search(vector).limit(6).to_list()

        return [
            {
                "filename": res['filename'],
                "page": res['page_number'],
                "snippet": res['text'][:150] + "...",
            }
            for res in neighbours
            if not (res['filename'] == filename and res['page_number'] == page)
        ]
    except Exception as e:
        print(f"Similar search error: {e}")
        return []


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)