Spaces:
Sleeping
Sleeping
| import os | |
| import sqlite3 | |
| import lancedb | |
| from fastapi import FastAPI, Request, HTTPException | |
| from fastapi.responses import HTMLResponse, Response | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.templating import Jinja2Templates | |
| from sentence_transformers import SentenceTransformer | |
| import uvicorn | |
| import fitz # PyMuPDF | |
| from PIL import Image, ImageDraw, ImageFont | |
| import io | |
| import zipfile | |
| from huggingface_hub import hf_hub_download | |
| import numpy as np | |
# --- APPLICATION & DATA BOOTSTRAP ---
app = FastAPI()

# Step 1: fetch data.zip from the Hugging Face dataset repository.
# The returned value is used below as the path of the archive to open.
print("📥 Downloading Data from Hugging Face Dataset...")
zip_path = hf_hub_download(
    repo_type="dataset",
    repo_id="AKMESSI/epstein-data",
    filename="data.zip",
)

# Step 2: unpack the archive into the working directory, but only when the
# data directory is not there yet.
DATA_DIR = "data"
if os.path.exists(DATA_DIR):
    print("✅ Data already extracted.")
else:
    print("📦 Extracting data.zip... (This takes a moment)")
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(".")
    print("✅ Extraction Complete!")

# Step 3: database locations.
# The SQLite file is looked up in the working directory. If it ships inside
# the extracted folder instead, point DB_NAME at
# os.path.join(DATA_DIR, "epstein.db").
DB_NAME = "epstein.db"
# The LanceDB vector store is expected inside the extracted data folder.
VECTOR_DB_DIR = os.path.join(DATA_DIR, "lancedb")
| # --- DATABASE INITIALIZATION --- | |
def init_db():
    """Create the SQLite schema in DB_NAME if it does not exist yet.

    Objects created (all with IF NOT EXISTS, so repeated calls are harmless):

    * ``pages`` - one row per document page.
    * ``pages_fts`` - an FTS5 table declared with ``content='pages'``.
    * ``pages_ai`` - an AFTER INSERT trigger copying new ``pages`` rows into
      ``pages_fts``.
    * ``search_analytics`` - per-term search counters.

    Only the INSERT trigger is installed here, so UPDATE or DELETE statements
    on ``pages`` are not mirrored into the full-text index by this schema.

    The connection is always closed, even when a statement fails; in that
    case the sqlite3 exception propagates to the caller.
    """
    schema_statements = (
        # 1. Main Pages
        """
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT,
            filepath TEXT,
            page_number INTEGER,
            text_content TEXT
        )
        """,
        # 2. FTS Virtual Table
        """
        CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
            filename,
            text_content,
            content='pages',
            content_rowid='id'
        )
        """,
        # 3. Triggers
        """
        CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
            INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
        END;
        """,
        # 4. Analytics
        """
        CREATE TABLE IF NOT EXISTS search_analytics (
            term TEXT PRIMARY KEY,
            count INTEGER DEFAULT 1,
            last_searched TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
    )

    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.cursor()
        for statement in schema_statements:
            cursor.execute(statement)
        conn.commit()
    finally:
        # Release the file handle even if one of the statements raised.
        conn.close()
| init_db() | |
| # --- CONNECT TO DB HELPERS --- | |
def get_db_connection():
    """Open a new SQLite connection to DB_NAME.

    Rows come back as ``sqlite3.Row`` objects, so columns can be read by name.
    The caller is responsible for closing the connection.
    """
    connection = sqlite3.connect(DB_NAME)
    connection.row_factory = sqlite3.Row
    return connection
# --- LOAD AI MODELS ---
print("Loading Text AI Model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Loading Visual AI Model (CLIP)...")
visual_model = SentenceTransformer('clip-ViT-B-32')

# Connect to LanceDB
ldb = lancedb.connect(VECTOR_DB_DIR)


def _open_table_or_none(name):
    """Return the LanceDB table called *name*, or None if it cannot be opened.

    The failure is printed rather than silently discarded so that a missing
    or broken index shows up in the logs. The app still starts; features
    backed by the table are skipped by the handlers that check for None.
    """
    try:
        return ldb.open_table(name)
    except Exception as exc:
        print(f"⚠️ LanceDB table '{name}' unavailable: {exc}")
        return None


# Open Tables
tbl = _open_table_or_none("pages")  # Text Vectors
visual_tbl = _open_table_or_none("visuals")  # Visual Vectors

# --- TEMPLATES ---
templates = Jinja2Templates(directory="templates")
# Expose the extracted data directory under the /files URL prefix.
app.mount("/files", StaticFiles(directory=DATA_DIR), name="files")
| # --- ROUTES --- | |
# NOTE(review): no route decorator is visible on this handler in this file -
# confirm it is registered with the app.
async def home(request: Request):
    """Render ``index.html`` with the five most frequently searched terms.

    Args:
        request: The incoming request, passed through to the template.

    Returns:
        The rendered template response. ``trends`` holds up to five rows of
        ``(term, count)`` ordered by count descending, or an empty list when
        the analytics query fails.
    """
    conn = get_db_connection()
    try:
        trends = conn.execute(
            "SELECT term, count FROM search_analytics ORDER BY count DESC LIMIT 5"
        ).fetchall()
    except sqlite3.Error as exc:
        # Trends are decorative: log the failure and render the page without them.
        print(f"Trends query error: {exc}")
        trends = []
    finally:
        conn.close()
    return templates.TemplateResponse("index.html", {"request": request, "trends": trends})
def _record_search_term(term):
    """Increment the analytics counter for *term*; never raises."""
    try:
        conn = get_db_connection()
        try:
            conn.execute("""
                INSERT INTO search_analytics (term, count, last_searched)
                VALUES (?, 1, CURRENT_TIMESTAMP)
                ON CONFLICT(term) DO UPDATE SET count = count + 1, last_searched = CURRENT_TIMESTAMP
            """, (term,))
            conn.commit()
        finally:
            conn.close()
    except Exception as exc:
        # Analytics are best-effort: a failure here must not break the search.
        print(f"Analytics error: {exc}")


def _visual_results(q):
    """Return up to 20 unique CLIP matches for *q* from the visual index."""
    results = []
    seen = set()
    try:
        print(f"🔍 Visual Index Size: {len(visual_tbl)} rows")
        # The "a photo of" prefix is kept from the original prompt wording.
        query_vec = visual_model.encode(f"a photo of {q}")
        # Fetch more rows than are shown so that de-duplication still leaves variety.
        for res in visual_tbl.search(query_vec).limit(50).to_list():
            uid = f"{res['filename']}-{res['page']}"
            if uid in seen:
                continue
            seen.add(uid)
            results.append({
                "type": "Visual Match",
                "filename": res['filename'],
                "page": res['page'],
                "text": f"Visual match for '{q}'",
                "score": 1.0 - res['_distance']
            })
    except Exception as e:
        print(f"Visual search error: {e}")
    return results[:20]


def _fts_match(conn, q):
    """Run the FTS5 keyword query, retrying *q* as a quoted phrase on a syntax error."""
    sql = """
        SELECT p.filename, p.page_number, snippet(pages_fts, 1, '<b>', '</b>', '...', 20) as snippet
        FROM pages_fts
        JOIN pages p ON pages_fts.rowid = p.id
        WHERE pages_fts MATCH ?
        ORDER BY rank LIMIT 20
    """
    try:
        return conn.execute(sql, (q,)).fetchall()
    except sqlite3.OperationalError:
        # Free-form input (stray quotes, hyphens, ...) may not be a valid FTS5
        # expression. Retry it as a single string literal, doubling embedded
        # double quotes as FTS5 string syntax requires.
        phrase = '"' + q.replace('"', '""') + '"'
        return conn.execute(sql, (phrase,)).fetchall()


def _keyword_results(q, seen):
    """Return "Exact Match" results from SQLite FTS, adding their ids to *seen*."""
    results = []
    try:
        conn = get_db_connection()
        try:
            rows = _fts_match(conn, q)
        finally:
            conn.close()
        for row in rows:
            uid = f"{row['filename']}-{row['page_number']}"
            if uid in seen:
                continue
            seen.add(uid)
            results.append({
                "type": "Exact Match",
                "filename": row['filename'],
                "page": row['page_number'],
                "text": row['snippet'],
                "score": 1.0
            })
    except Exception as e:
        print(f"Text SQL Error: {e}")
    return results


def _concept_results(q, seen):
    """Return "Concept Match" results from the text vector table, skipping ids in *seen*."""
    results = []
    try:
        vector_query = text_model.encode(q)
        for res in tbl.search(vector_query).limit(20).to_list():
            uid = f"{res['filename']}-{res['page_number']}"
            if uid in seen:
                continue
            seen.add(uid)
            results.append({
                "type": "Concept Match",
                "filename": res['filename'],
                "page": res['page_number'],
                "text": res['text'][:200] + "...",
                "score": 1.0 - res['_distance']
            })
    except Exception as e:
        print(f"Concept search error: {e}")
    return results


# NOTE(review): no route decorator is visible on this handler in this file -
# confirm it is registered with the app.
async def search(request: Request, q: str, searchmode: str = "text"):
    """Search the archive and render ``partials/results.html``.

    Args:
        request: The incoming request, passed through to the template.
        q: The user's query. An empty or whitespace-only query returns ``""``.
        searchmode: ``"visual"`` uses the CLIP index when that table is
            available. Any other value runs the SQLite keyword search, and
            ``"text"`` additionally appends vector "concept" matches when the
            text table is available.

    Returns:
        ``""`` for a blank query, otherwise the rendered template response.
        ``results`` is a list of dicts with the keys ``type``, ``filename``,
        ``page``, ``text`` and ``score``.

    Each backend failure is logged and yields no results from that backend
    rather than an error response.
    """
    if not q or not q.strip():
        return ""

    _record_search_term(q.lower().strip())

    # The tables are compared with None instead of being tested for
    # truthiness: they support len(), so an empty table would otherwise look
    # like a missing one (and cost a row count on every request).
    if searchmode == "visual" and visual_tbl is not None:
        results = _visual_results(q)
    else:
        seen_files = set()
        results = _keyword_results(q, seen_files)
        if searchmode == "text" and tbl is not None:
            results += _concept_results(q, seen_files)

    return templates.TemplateResponse("partials/results.html", {"request": request, "results": results})
# NOTE(review): no route decorator is visible on this handler in this file -
# confirm it is registered with the app.
async def view_document(request: Request, filename: str, page: int = 1):
    """Render ``viewer.html`` for the first file under DATA_DIR named *filename*.

    The directory tree is walked until a file with exactly that name is found.
    Its path relative to DATA_DIR, with forward slashes, is exposed to the
    template as a ``/files/...`` URL.

    Raises:
        HTTPException: 404 when no file with that name exists under DATA_DIR.
    """
    for folder, _subdirs, names in os.walk(DATA_DIR):
        if filename not in names:
            continue
        relative = os.path.relpath(os.path.join(folder, filename), DATA_DIR)
        filepath = f"/files/{relative.replace(os.sep, '/')}"
        break
    else:
        # The walk finished without a match.
        raise HTTPException(status_code=404, detail="File not found")

    context = {
        "request": request,
        "filename": filename,
        "filepath": filepath,
        "page": page,
    }
    return templates.TemplateResponse("viewer.html", context)
# --- API ENDPOINTS ---
# NOTE(review): no route decorator is visible on this handler in this file -
# confirm it is registered with the app.
async def snap_evidence(filename: str, page: int):
    """Render one PDF page as a PNG with a black footer naming its source.

    Args:
        filename: Name of a file somewhere under DATA_DIR.
        page: 1-based page number.

    Returns:
        A ``Response`` with media type ``image/png``.

    Raises:
        HTTPException: 404 when the file is not found or *page* is outside
            ``1..page count``; 500 when rendering fails for any other reason.
    """
    # Find file
    filepath = None
    for root, _dirs, files in os.walk(DATA_DIR):
        if filename in files:
            filepath = os.path.join(root, filename)
            break
    if not filepath:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        # Render
        doc = fitz.open(filepath)
        try:
            # Reject out-of-range pages explicitly: page <= 0 would otherwise
            # be passed on as a negative page index instead of being refused.
            if not 1 <= page <= len(doc):
                raise HTTPException(status_code=404, detail="Page not found")
            pdf_page = doc.load_page(page - 1)
            pix = pdf_page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        finally:
            # Close the document even when rendering fails.
            doc.close()

        # Watermark
        draw = ImageDraw.Draw(img)
        width, height = img.size
        footer_h = 60
        draw.rectangle([(0, height - footer_h), (width, height)], fill="#000000")
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except OSError:
            # Font file not available on this host: fall back to PIL's built-in font.
            font = ImageFont.load_default()
        text = f"EVIDENCE: {filename} | PG {page} | SOURCE: EPSTEIN ARCHIVE"
        draw.text((20, height - 40), text, fill="white", font=font)

        # Return
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return Response(content=buffer.getvalue(), media_type="image/png")
    except HTTPException:
        # Keep the 404 raised above from being rewrapped as a 500.
        raise
    except Exception as e:
        print(f"Snap error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): no route decorator is visible on this handler in this file -
# confirm it is registered with the app.
async def similar_evidence(filename: str, page: int):
    """Return pages whose text vectors are nearest to the given page.

    Args:
        filename: File name as stored in the text vector table.
        page: Page number as stored in the text vector table.

    Returns:
        A list of dicts with the keys ``filename``, ``page`` and ``snippet``
        (the first 150 characters of the page text followed by ``"..."``).
        The requested page itself is excluded. An empty list is returned when
        the table is unavailable, the page is not indexed, or the lookup fails.
    """
    if tbl is None:
        return []
    try:
        # filename is caller-supplied and ends up inside a SQL-style filter
        # string, so double any single quotes to keep it a plain string literal.
        safe_name = filename.replace("'", "''")
        current_page = (
            tbl.search()
            .where(f"filename = '{safe_name}' AND page_number = {int(page)}")
            .limit(1)
            .to_list()
        )
        if not current_page:
            return []

        vector = current_page[0]['vector']
        # Six neighbours are requested; the requested page is filtered out below.
        neighbours = tbl.search(vector).limit(6).to_list()
        return [
            {
                "filename": res['filename'],
                "page": res['page_number'],
                "snippet": res['text'][:150] + "..."
            }
            for res in neighbours
            if not (res['filename'] == filename and res['page_number'] == page)
        ]
    except Exception as e:
        # Recommendations are optional: log and return nothing rather than fail.
        print(f"Similar evidence error: {e}")
        return []
if __name__ == "__main__":
    # Script entry point: serve the app on every interface at port 7860.
    uvicorn.run(
        app,
        port=7860,
        host="0.0.0.0",
    )