# Hugging Face Space: AKMESSI/archive-explorer (status when captured: Sleeping)
# File size: 10,678 bytes

import os
import sqlite3
import lancedb
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import HTMLResponse, Response
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from sentence_transformers import SentenceTransformer
import uvicorn
import fitz # PyMuPDF
from PIL import Image, ImageDraw, ImageFont
import io
import zipfile
from huggingface_hub import hf_hub_download
import numpy as np

# ASGI application object; the route handlers below register themselves on it.
app = FastAPI()

# --- CONFIGURATION & UNZIPPING ---
# NOTE: everything in this section runs at import time, before the server
# starts handling requests.
print("📥 Downloading Data from Hugging Face Dataset...")

# 1. Download the ZIP file
# hf_hub_download is called on every start-up, even when DATA_DIR already
# exists; its return value is used below as the path of the archive to open.
zip_path = hf_hub_download(
    repo_id="AKMESSI/epstein-data", 
    filename="data.zip", 
    repo_type="dataset"
)

# 2. Extract it (if not already extracted)
# The mere existence of DATA_DIR is the "already extracted" check, so a
# partially extracted directory is not detected or repaired.
DATA_DIR = "data"
if not os.path.exists(DATA_DIR):
    print("📦 Extracting data.zip... (This takes a moment)")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Assumes the archive's top-level entry is "data/", so extracting into
        # the working directory produces DATA_DIR -- TODO confirm against the zip.
        zip_ref.extractall(".") # Extracts to current folder
    print("✅ Extraction Complete!")
else:
    print("✅ Data already extracted.")

# 3. Set DB Paths
# The zip contains "data/", so we look inside it
# NOTE: DB_NAME is relative to the working directory, i.e. it is NOT inside
# DATA_DIR. init_db() will create an empty database at this path if none exists.
DB_NAME = "epstein.db" # This should ideally be uploaded separately if it's not in the zip
# If your DB is inside the data folder, update this path:
# DB_NAME = os.path.join(DATA_DIR, "epstein.db") 

# Directory handed to lancedb.connect() further down.
VECTOR_DB_DIR = os.path.join(DATA_DIR, "lancedb")

# --- DATABASE INITIALIZATION ---
def init_db():
    """Create the SQLite schema at ``DB_NAME`` if it does not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is safe.
    The following objects are created:

    * ``pages`` -- one row per document page (filename, path, number, text).
    * ``pages_fts`` -- FTS5 external-content index over ``pages``.
    * ``pages_ai`` -- trigger that mirrors inserts on ``pages`` into the index.
    * ``search_analytics`` -- per-term search counter with last-searched time.

    Raises:
        sqlite3.Error: if the database cannot be opened or a statement fails
            (for example when the SQLite build lacks FTS5).
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.cursor()
        # 1. Main Pages
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT,
                filepath TEXT,
                page_number INTEGER,
                text_content TEXT
            )
        """)
        # 2. FTS Virtual Table
        # External-content table: the index stores no text of its own and
        # reads it from ``pages`` through rowid = pages.id.
        cursor.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
                filename,
                text_content,
                content='pages',
                content_rowid='id'
            )
        """)
        # 3. Triggers
        # NOTE: only INSERT is mirrored. Rows updated in or deleted from
        # ``pages`` are not propagated to ``pages_fts``.
        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
                INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
            END;
        """)
        # 4. Analytics
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS search_analytics (
                term TEXT PRIMARY KEY,
                count INTEGER DEFAULT 1,
                last_searched TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
    finally:
        # Close even when a statement fails so the handle is never leaked.
        conn.close()

# Runs at import time so the schema exists before any route handler queries it.
init_db()

# --- CONNECT TO DB HELPERS ---
def get_db_connection():
    """Open a fresh SQLite connection to ``DB_NAME``.

    The connection's row factory is ``sqlite3.Row``, so result columns can be
    read by name (``row['filename']``). The caller owns the connection and is
    responsible for closing it.
    """
    connection = sqlite3.connect(DB_NAME)
    connection.row_factory = sqlite3.Row
    return connection

# --- LOAD AI MODELS ---
# Both models are loaded once at import time and shared by all requests.
print("Loading Text AI Model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Loading Visual AI Model (CLIP)...")
visual_model = SentenceTransformer('clip-ViT-B-32')

# Connect to LanceDB
ldb = lancedb.connect(VECTOR_DB_DIR)

# Open Tables
# A table that cannot be opened is represented by None; the routes treat None
# as "this kind of search is unavailable". The reason is logged instead of
# being swallowed, and only Exception is caught so Ctrl-C / SystemExit during
# start-up still propagate.
try:
    tbl = ldb.open_table("pages") # Text Vectors
except Exception as e:
    print(f"Text vector table 'pages' unavailable: {e}")
    tbl = None

try:
    visual_tbl = ldb.open_table("visuals") # Visual Vectors
except Exception as e:
    print(f"Visual vector table 'visuals' unavailable: {e}")
    visual_tbl = None

# --- TEMPLATES ---
templates = Jinja2Templates(directory="templates")
# NOTE: this serves everything under DATA_DIR at /files, which includes the
# "lancedb" directory that lives inside DATA_DIR.
app.mount("/files", StaticFiles(directory=DATA_DIR), name="files")

# --- ROUTES ---

@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the landing page together with the five most-searched terms.

    Args:
        request: Incoming request; passed through to the template context.

    Returns:
        The rendered ``index.html`` template. ``trends`` in the context is a
        list of rows with ``term`` and ``count``; it is empty when the
        analytics query fails.
    """
    conn = get_db_connection()
    try:
        trends = conn.execute(
            "SELECT term, count FROM search_analytics ORDER BY count DESC LIMIT 5"
        ).fetchall()
    except sqlite3.Error as e:
        # Trending terms are decorative: log and render the page without them.
        print(f"Trending query error: {e}")
        trends = []
    finally:
        conn.close()
    return templates.TemplateResponse("index.html", {"request": request, "trends": trends})

# Keyword query shared by the raw attempt and the quoted-literal retry.
_KEYWORD_SQL = """
    SELECT p.filename, p.page_number, snippet(pages_fts, 1, '<b>', '</b>', '...', 20) as snippet 
    FROM pages_fts 
    JOIN pages p ON pages_fts.rowid = p.id 
    WHERE pages_fts MATCH ? 
    ORDER BY rank LIMIT 20
"""


def _record_search(term):
    """Increment the analytics counter for *term*; failures are logged, never raised."""
    conn = None
    try:
        conn = get_db_connection()
        conn.execute("""
            INSERT INTO search_analytics (term, count, last_searched) 
            VALUES (?, 1, CURRENT_TIMESTAMP)
            ON CONFLICT(term) DO UPDATE SET count = count + 1, last_searched = CURRENT_TIMESTAMP
        """, (term,))
        conn.commit()
    except sqlite3.Error as e:
        print(f"Analytics error: {e}")
    finally:
        if conn is not None:
            conn.close()


def _fts_literal_query(q):
    """Turn free text into an FTS5 query of double-quoted literal tokens."""
    # Inside an FTS5 string, a double quote is escaped by doubling it.
    return " ".join('"' + token.replace('"', '""') + '"' for token in q.split())


def _keyword_rows(q):
    """Run the FTS5 keyword query for *q*, retrying as literals on a syntax error."""
    conn = get_db_connection()
    try:
        try:
            return conn.execute(_KEYWORD_SQL, (q,)).fetchall()
        except sqlite3.OperationalError:
            # Raw user input is often not valid FTS5 syntax (hyphens, stray
            # quotes, ...). Queries that already parse keep their meaning;
            # only the failing ones are retried with every token quoted.
            literal = _fts_literal_query(q)
            if not literal:
                return []
            return conn.execute(_KEYWORD_SQL, (literal,)).fetchall()
    finally:
        conn.close()


def _visual_results(q, limit=20):
    """Return up to *limit* unique (filename, page) CLIP matches for *q*."""
    # Prefixing "a photo of" helps CLIP focus on the visual concept.
    query_vec = visual_model.encode(f"a photo of {q}")
    # Over-fetch so that enough hits remain after de-duplication.
    hits = visual_tbl.search(query_vec).limit(50).to_list()

    unique, seen = [], set()
    for hit in hits:
        key = (hit['filename'], hit['page'])
        if key in seen:
            continue
        seen.add(key)
        unique.append({
            "type": "Visual Match",
            "filename": hit['filename'],
            "page": hit['page'],
            "text": f"Visual match for '{q}'",
            "score": 1.0 - hit['_distance']
        })
        if len(unique) == limit:
            break
    return unique


@app.get("/search", response_class=HTMLResponse)
async def search(request: Request, q: str, searchmode: str = "text"):
    """Search the archive and render the results partial.

    Args:
        request: Incoming request; passed through to the template context.
        q: The user's query string.
        searchmode: ``"visual"`` searches the CLIP image index when that table
            is available. Otherwise an FTS5 keyword search is run, and for
            ``"text"`` it is followed by a text-embedding concept search.

    Returns:
        An empty string for a blank query, otherwise the rendered
        ``partials/results.html`` template. Each entry of ``results`` is a
        dict with ``type``, ``filename``, ``page``, ``text`` and ``score``.
        Failures of an individual search backend are logged and produce no
        results from that backend rather than an error response.
    """
    if not q.strip():
        return ""

    # 1. ANALYTICS
    _record_search(q.lower().strip())

    results = []

    # --- MODE 1: VISUAL SEARCH ---
    # Compare against None explicitly: the table object supports len(), so a
    # truthiness test would count rows on every request and treat an empty
    # table as "missing".
    if searchmode == "visual" and visual_tbl is not None:
        try:
            results = _visual_results(q)
        except Exception as e:
            print(f"Visual search error: {e}")
        return templates.TemplateResponse("partials/results.html", {"request": request, "results": results})

    # --- MODE 2: TEXT SEARCH ---
    seen = set()  # (filename, page) pairs already present in `results`

    # A. SQLite Keyword Search
    try:
        for row in _keyword_rows(q):
            key = (row['filename'], row['page_number'])
            if key in seen:
                continue
            seen.add(key)
            results.append({
                "type": "Exact Match",
                "filename": row['filename'],
                "page": row['page_number'],
                "text": row['snippet'],
                "score": 1.0
            })
    except Exception as e:
        print(f"Text SQL Error: {e}")

    # B. LanceDB Text Concept Search
    if tbl is not None and searchmode == "text":
        try:
            vector_query = text_model.encode(q)
            for hit in tbl.search(vector_query).limit(20).to_list():
                key = (hit['filename'], hit['page_number'])
                if key in seen:
                    continue
                seen.add(key)
                results.append({
                    "type": "Concept Match",
                    "filename": hit['filename'],
                    "page": hit['page_number'],
                    "text": hit['text'][:200] + "...",
                    "score": 1.0 - hit['_distance']
                })
        except Exception as e:
            # Best effort: keep whatever the keyword search found.
            print(f"Concept search error: {e}")

    return templates.TemplateResponse("partials/results.html", {"request": request, "results": results})

@app.get("/view/{filename}", response_class=HTMLResponse)
async def view_document(request: Request, filename: str, page: int = 1):
    """Render the document viewer for *filename*, opened at *page*.

    Walks ``DATA_DIR`` and uses the first directory that contains a file with
    this exact name, then maps it to its URL under the ``/files`` static mount.

    Raises:
        HTTPException: 404 when no file named *filename* exists under
            ``DATA_DIR``.
    """
    containing_dir = next(
        (folder for folder, _subdirs, names in os.walk(DATA_DIR) if filename in names),
        None,
    )
    if containing_dir is None:
        raise HTTPException(status_code=404, detail="File not found")

    # Build a URL path: always forward slashes, regardless of the host OS.
    relative = os.path.relpath(os.path.join(containing_dir, filename), DATA_DIR)
    filepath = f"/files/{relative.replace(os.sep, '/')}"

    context = {
        "request": request,
        "filename": filename,
        "filepath": filepath,
        "page": page,
    }
    return templates.TemplateResponse("viewer.html", context)

# --- API ENDPOINTS ---

@app.get("/api/snap/{filename}/{page}")
async def snap_evidence(filename: str, page: int):
    """Render one page of a document as a watermarked PNG.

    Args:
        filename: Name of a file located anywhere under ``DATA_DIR``.
        page: 1-based page number.

    Returns:
        A ``image/png`` response containing the page rendered at 2x zoom with
        a black footer bar naming the file and page.

    Raises:
        HTTPException: 404 when the file does not exist or *page* is outside
            ``1..page count``; 500 when rendering fails for any other reason.
    """
    # Find file
    filepath = None
    for root, _dirs, files in os.walk(DATA_DIR):
        if filename in files:
            filepath = os.path.join(root, filename)
            break
    if filepath is None:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        # Render
        doc = fitz.open(filepath)
        try:
            # Validate explicitly: a negative index passed to load_page()
            # counts from the end, so page=0 would silently render the last
            # page instead of failing.
            if not 1 <= page <= len(doc):
                raise HTTPException(status_code=404, detail="Page not found")
            pix = doc.load_page(page - 1).get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        finally:
            # Release the document even when rendering raises.
            doc.close()

        # Watermark
        draw = ImageDraw.Draw(img)
        width, height = img.size
        footer_h = 60
        draw.rectangle([(0, height - footer_h), (width, height)], fill="#000000")
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except (OSError, ImportError):
            # Font file missing or FreeType unavailable: use the built-in font.
            font = ImageFont.load_default()
        text = f"EVIDENCE: {filename} | PG {page} | SOURCE: EPSTEIN ARCHIVE"
        draw.text((20, height - 40), text, fill="white", font=font)

        # Return
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return Response(content=buffer.getvalue(), media_type="image/png")
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 404 above) from becoming 500s.
        raise
    except Exception as e:
        print(f"Snap error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/similar/{filename}/{page}")
async def similar_evidence(filename: str, page: int):
    """Return pages whose text embedding is closest to the given page.

    Args:
        filename: Filename of the reference page as stored in the vector table.
        page: Page number of the reference page.

    Returns:
        A list of at most five dicts with ``filename``, ``page`` and a
        150-character ``snippet``. The list is empty when the text vector
        table is unavailable, the reference page is not indexed, or the
        lookup fails.
    """
    # Explicit None check: the table supports len(), so truthiness would
    # trigger a row count on every call.
    if tbl is None:
        return []
    try:
        # The filter is a SQL expression; double any single quote so a
        # filename cannot terminate the string literal early.
        safe_name = filename.replace("'", "''")
        current_page = (
            tbl.search()
            .where(f"filename = '{safe_name}' AND page_number = {page}")
            .limit(1)
            .to_list()
        )
        if not current_page:
            return []

        vector = current_page[0]['vector']
        # Ask for one extra hit because the reference page is normally its
        # own nearest neighbour and is filtered out below.
        neighbours = tbl.search(vector).limit(6).to_list()

        return [
            {
                "filename": hit['filename'],
                "page": hit['page_number'],
                "snippet": hit['text'][:150] + "..."
            }
            for hit in neighbours
            if not (hit['filename'] == filename and hit['page_number'] == page)
        ]
    except Exception as e:
        # Best effort: the endpoint degrades to "no suggestions".
        print(f"Similar search error: {e}")
        return []

# Start the server only when this file is executed directly as a script
# (not when it is imported); binds to all interfaces on port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)