AKMESSI committed on
Commit
d0a567e
·
1 Parent(s): 5c0acd3

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.db filter=lfs diff=lfs merge=lfs -text
3
+ *.lancedb filter=lfs diff=lfs merge=lfs -text
4
+ data/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- IGNORE HUGE DATA ---
2
+ data/
3
+ *.zip
4
+ *.db
5
+ *.lancedb
6
+ *.pdf
7
+
8
+ # --- IGNORE SYSTEM JUNK ---
9
+ __pycache__/
10
+ *.pyc
11
+ venv/
12
+ .venv/
13
+ .DS_Store
14
+ .env
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python 3.10 as base
FROM python:3.10

# 1. Install system dependencies (must run as root).
# NOTE(review): the Python code in this commit renders PDFs with PyMuPDF
# (fitz); confirm poppler-utils is still required by something in
# requirements.txt before keeping it.
USER root
RUN apt-get update && apt-get install -y \
    poppler-utils \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

# 2. Set up a new user "user" (Security requirement for HF Spaces)
# Everything below this point runs as the unprivileged uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# 3. Set Working Directory
WORKDIR $HOME/app

# 4. Copy Dependencies
# requirements.txt is copied on its own so the pip layer is cached until
# the dependency list changes.
COPY --chown=user requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# 5. Copy the Application Code & Data
COPY --chown=user . .

# 6. Expose the Port (Hugging Face expects port 7860)
EXPOSE 7860

# 7. Start the App (serves the FastAPI instance `app` from app.py)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import lancedb
4
+ from fastapi import FastAPI, Request, HTTPException
5
+ from fastapi.responses import HTMLResponse, Response
6
+ from fastapi.staticfiles import StaticFiles
7
+ from fastapi.templating import Jinja2Templates
8
+ from sentence_transformers import SentenceTransformer
9
+ import uvicorn
10
+ import fitz # PyMuPDF
11
+ from PIL import Image, ImageDraw, ImageFont
12
+ import io
13
+ import zipfile
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ app = FastAPI()
17
+
18
# --- CONFIGURATION & UNZIPPING ---
# Everything in this section runs at import time, i.e. once per server
# process start, before any route is registered.
print("📥 Downloading Data from Hugging Face Dataset...")

# 1. Download the ZIP file
# hf_hub_download returns the local path of the downloaded file.
zip_path = hf_hub_download(
    repo_id="AKMESSI/epstein-data",
    filename="data.zip",
    repo_type="dataset"
)

# 2. Extract it (if not already extracted)
# The presence of the "data" directory is the only "already extracted" signal;
# a partially extracted directory is treated as complete.
DATA_DIR = "data"
if not os.path.exists(DATA_DIR):
    print("📦 Extracting data.zip... (This takes a moment)")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(".")  # Extracts to current folder
    print("✅ Extraction Complete!")
else:
    print("✅ Data already extracted.")

# 3. Set DB Paths
# The zip contains "data/", so we look inside it
DB_NAME = "epstein.db"  # This should ideally be uploaded separately if it's not in the zip
# If your DB is inside the data folder, update this path:
# DB_NAME = os.path.join(DATA_DIR, "epstein.db")

# LanceDB vector store lives inside the extracted data directory.
VECTOR_DB_DIR = os.path.join(DATA_DIR, "lancedb")
45
+
46
+ # --- DATABASE INITIALIZATION ---
47
def init_db():
    """Create the SQLite schema in ``DB_NAME`` if it does not exist yet.

    Ensures these objects exist (all statements are ``IF NOT EXISTS``, so the
    function is safe to call on every start-up):

    * ``pages``            - one row per extracted PDF page
    * ``pages_fts``        - FTS5 index over ``pages`` (external content)
    * ``pages_ai``         - AFTER INSERT trigger that mirrors new pages into
                             the FTS index
    * ``search_analytics`` - per-term search counters

    The connection is always closed, even when a statement fails.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.cursor()
        # 1. Main Pages
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT,
                filepath TEXT,
                page_number INTEGER,
                text_content TEXT
            )
        """)
        # 2. FTS Virtual Table
        cursor.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
                filename,
                text_content,
                content='pages',
                content_rowid='id'
            )
        """)
        # 3. Triggers
        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
                INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
            END;
        """)
        # 4. Analytics
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS search_analytics (
                term TEXT PRIMARY KEY,
                count INTEGER DEFAULT 1,
                last_searched TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
    finally:
        # Release the file handle even if one of the DDL statements raised.
        conn.close()
85
+
86
+ init_db()
87
+
88
+ # --- CONNECT TO DB HELPERS ---
89
def get_db_connection():
    """Open a fresh SQLite connection to ``DB_NAME``.

    Rows come back as ``sqlite3.Row`` so callers can index columns by name.
    The caller is responsible for closing the connection.
    """
    connection = sqlite3.connect(DB_NAME)
    connection.row_factory = sqlite3.Row
    return connection
93
+
94
# --- LOAD AI MODELS ---
# Both models are loaded once at import time and shared by all requests.
print("Loading Text AI Model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Loading Visual AI Model (CLIP)...")
visual_model = SentenceTransformer('clip-ViT-B-32')

# Connect to LanceDB
ldb = lancedb.connect(VECTOR_DB_DIR)

# Open Tables
# A table that cannot be opened disables the matching search mode instead of
# preventing start-up; the reason is logged so the degradation is visible.
try:
    tbl = ldb.open_table("pages")  # Text Vectors
except Exception as e:
    print(f"Text vector table unavailable: {e}")
    tbl = None

try:
    visual_tbl = ldb.open_table("visuals")  # Visual Vectors
except Exception as e:
    print(f"Visual vector table unavailable: {e}")
    visual_tbl = None

# --- TEMPLATES ---
templates = Jinja2Templates(directory="templates")
# Serve the extracted data directory so the viewer iframe can load files.
app.mount("/files", StaticFiles(directory=DATA_DIR), name="files")
118
+
119
+ # --- ROUTES ---
120
+
121
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the landing page together with the five most-searched terms.

    A failing analytics query is logged and results in an empty trend list,
    so the page still renders.
    """
    conn = get_db_connection()
    try:
        trends = conn.execute(
            "SELECT term, count FROM search_analytics ORDER BY count DESC LIMIT 5"
        ).fetchall()
    except sqlite3.Error as e:
        print(f"Trends query error: {e}")
        trends = []
    finally:
        conn.close()
    return templates.TemplateResponse("index.html", {"request": request, "trends": trends})
132
+
133
@app.get("/search", response_class=HTMLResponse)
async def search(request: Request, q: str, searchmode: str = "text"):
    """Return the rendered result list for query ``q``.

    Args:
        request: Incoming request (required by the template response).
        q: Raw user query.
        searchmode: ``"visual"`` searches the visual vector table with the
            query embedded by ``visual_model``; any other value runs keyword
            (FTS5) search followed by text-vector search.

    Returns:
        The ``partials/results.html`` fragment, or an empty string for an
        empty / whitespace-only query.

    Every ``text`` value handed to the template is HTML-escaped here because
    the template renders it unescaped (to keep ``<b>`` highlights).
    """
    import html  # local import: only this handler needs it

    if not q or not q.strip():
        return ""

    # 1. ANALYTICS (best effort: a failure must never break searching)
    try:
        conn = get_db_connection()
        try:
            conn.execute("""
                INSERT INTO search_analytics (term, count, last_searched)
                VALUES (?, 1, CURRENT_TIMESTAMP)
                ON CONFLICT(term) DO UPDATE SET count = count + 1, last_searched = CURRENT_TIMESTAMP
            """, (q.lower().strip(),))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"Analytics error: {e}")

    results = []
    seen_files = set()

    # --- MODE 1: VISUAL SEARCH ---
    if searchmode == "visual" and visual_tbl:
        try:
            # Encode text query to Visual Vector Space
            query_vec = visual_model.encode(q)
            vec_results = visual_tbl.search(query_vec).limit(20).to_list()

            for res in vec_results:
                results.append({
                    "type": "Visual Match",
                    "filename": res['filename'],
                    "page": res['page'],
                    # q is user input and is rendered unescaped by the template.
                    "text": f"Image match for '{html.escape(q)}'",
                    "score": 1.0 - res['_distance']
                })
        except Exception as e:
            print(f"Visual search error: {e}")

        return templates.TemplateResponse("partials/results.html", {"request": request, "results": results})

    # --- MODE 2: TEXT/HYBRID SEARCH ---

    # A. SQLite Keyword Search
    # Highlights are requested as control-character sentinels so the snippet
    # can be HTML-escaped first and the <b> tags inserted afterwards.
    hl_open, hl_close = "\x02", "\x03"
    rows = []
    conn = get_db_connection()
    try:
        rows = conn.execute("""
            SELECT p.filename, p.page_number,
                   snippet(pages_fts, 1, ?, ?, '...', 20) as snippet
            FROM pages_fts
            JOIN pages p ON pages_fts.rowid = p.id
            WHERE pages_fts MATCH ?
            ORDER BY rank LIMIT 10
        """, (hl_open, hl_close, q)).fetchall()
    except sqlite3.OperationalError as e:
        # Raised for queries that are not valid FTS5 syntax (stray quotes,
        # operators, ...); fall through to the vector search instead of a 500.
        print(f"Keyword search error: {e}")
    finally:
        conn.close()

    for row in rows:
        highlighted = (
            html.escape(row['snippet'])
            .replace(hl_open, "<b>")
            .replace(hl_close, "</b>")
        )
        results.append({
            "type": "Exact Match",
            "filename": row['filename'],
            "page": row['page_number'],
            "text": highlighted,
            "score": 1.0
        })
        seen_files.add(f"{row['filename']}-{row['page_number']}")

    # B. LanceDB Text Concept Search
    if tbl:
        try:
            vector_query = text_model.encode(q)
            vec_results = tbl.search(vector_query).limit(10).to_list()
            for res in vec_results:
                unique_id = f"{res['filename']}-{res['page_number']}"
                if unique_id in seen_files:
                    continue
                # Several chunks of one page can match; list each page once.
                seen_files.add(unique_id)
                results.append({
                    "type": "Concept Match",
                    "filename": res['filename'],
                    "page": res['page_number'],
                    "text": html.escape(res['text'][:200]) + "...",
                    "score": 1.0 - res['_distance']
                })
        except Exception as e:
            print(f"Concept search error: {e}")

    return templates.TemplateResponse("partials/results.html", {"request": request, "results": results})
219
+
220
@app.get("/view/{filename}", response_class=HTMLResponse)
async def view_document(request: Request, filename: str, page: int = 1):
    """Render the document viewer for ``filename`` opened at ``page``.

    The file is located by walking ``DATA_DIR``; the first match wins.

    Raises:
        HTTPException: 404 when no file with that name exists under
            ``DATA_DIR``.
    """
    from urllib.parse import quote  # local import: only this handler needs it

    filepath = None
    for root, _dirs, files in os.walk(DATA_DIR):
        if filename in files:
            rel_path = os.path.relpath(os.path.join(root, filename), DATA_DIR)
            # Percent-encode the path: the template appends "#page=N", so a raw
            # '#', '?', '%' or space in a file name would corrupt the iframe URL.
            filepath = "/files/" + quote(rel_path.replace(os.sep, "/"))
            break
    if filepath is None:
        raise HTTPException(status_code=404, detail="File not found")

    return templates.TemplateResponse(
        "viewer.html",
        {"request": request, "filename": filename, "filepath": filepath, "page": page},
    )
231
+
232
+ # --- API ENDPOINTS ---
233
+
234
@app.get("/api/snap/{filename}/{page}")
async def snap_evidence(filename: str, page: int):
    """Render one PDF page as a PNG with a black caption footer.

    Args:
        filename: Name of a file somewhere under ``DATA_DIR``.
        page: 1-based page number.

    Returns:
        A ``image/png`` response containing the page rendered at 2x scale,
        with a footer naming the file and page.

    Raises:
        HTTPException: 404 if the file or the page does not exist, 500 if
            rendering fails.
    """
    # Find file
    filepath = None
    for root, _dirs, files in os.walk(DATA_DIR):
        if filename in files:
            filepath = os.path.join(root, filename)
            break
    if not filepath:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        # Render
        doc = fitz.open(filepath)
        try:
            # Validate explicitly: load_page accepts negative indices, so
            # page=0 would otherwise silently render the last page.
            if not 1 <= page <= doc.page_count:
                raise HTTPException(status_code=404, detail="Page not found")
            pix = doc.load_page(page - 1).get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        finally:
            # Always release the PDF handle, including on render errors.
            doc.close()

        # Watermark
        draw = ImageDraw.Draw(img)
        width, height = img.size
        footer_h = 60
        draw.rectangle([(0, height - footer_h), (width, height)], fill="#000000")
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except OSError:
            # Font file not available on this system; use Pillow's built-in one.
            font = ImageFont.load_default()
        text = f"EVIDENCE: {filename} | PG {page} | SOURCE: EPSTEIN ARCHIVE"
        draw.text((20, height - 40), text, fill="white", font=font)

        # Return
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return Response(content=buffer.getvalue(), media_type="image/png")
    except HTTPException:
        # Keep the deliberate 404 above from being rewritten into a 500.
        raise
    except Exception as e:
        print(f"Snap error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
270
+
271
@app.get("/api/similar/{filename}/{page}")
async def similar_evidence(filename: str, page: int):
    """Return up to six text chunks most similar to the given page.

    Looks up one stored vector for ``filename``/``page`` in the text vector
    table and uses it as the query for a nearest-neighbour search. Hits that
    belong to the requested page itself are dropped.

    Returns:
        A list of ``{"filename", "page", "snippet"}`` dicts; empty when the
        table is unavailable, the page is not indexed, or the lookup fails.
    """
    if not tbl:
        return []
    try:
        # filename comes from the URL: double single quotes so it cannot
        # terminate the string literal in the filter expression.
        safe_name = filename.replace("'", "''")
        current_page = (
            tbl.search()
            .where(f"filename = '{safe_name}' AND page_number = {page}")
            .limit(1)
            .to_list()
        )
        if not current_page:
            return []

        vector = current_page[0]['vector']
        results = tbl.search(vector).limit(6).to_list()

        similar = []
        for res in results:
            if res['filename'] == filename and res['page_number'] == page:
                continue
            similar.append({
                "filename": res['filename'],
                "page": res['page_number'],
                "snippet": res['text'][:150] + "..."
            })
        return similar
    except Exception as e:
        # Best effort: the viewer still works without related pages.
        print(f"Similar search error: {e}")
        return []
292
+
293
+ if __name__ == "__main__":
294
+ uvicorn.run(app, host="0.0.0.0", port=7860)
ingest.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import lancedb
4
+ import PyPDF2
5
+ from sentence_transformers import SentenceTransformer
6
+ from lancedb.pydantic import LanceModel, Vector
7
+ import warnings
8
+
9
+ # Suppress warnings
10
+ warnings.filterwarnings("ignore")
11
+
12
+ # CONFIGURATION
13
+ DATA_DIR = "data"
14
+ DB_NAME = "epstein.db"
15
+ VECTOR_DB_DIR = "data/lancedb"
16
+
17
print("Initializing models and databases...")

# 1. Setup SQLite (For Keyword Search)
# The connection and cursor are module-level and shared by process_pdf().
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# Create main table and FTS (Full Text Search) virtual table
cursor.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename TEXT,
        filepath TEXT,
        page_number INTEGER,
        text_content TEXT
    )
""")
cursor.execute("""
    CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
        filename,
        text_content,
        content='pages',
        content_rowid='id'
    )
""")
# Mirror every inserted page into the FTS index.
cursor.execute("""
    CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
        INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
    END;
""")
conn.commit()

# 2. Setup LanceDB (For Vector/AI Search)
model = SentenceTransformer('all-MiniLM-L6-v2')
ldb = lancedb.connect(VECTOR_DB_DIR)


# --- THE FIX: Use Pydantic to define the Schema ---
class PageSchema(LanceModel):
    """Row layout of the "pages" vector table: one row per text chunk."""

    vector: Vector(384)  # 384 is the dimension of all-MiniLM-L6-v2
    text: str
    filename: str
    page_number: int
    filepath: str


# Create or Open the table using the Class Schema
try:
    tbl = ldb.open_table("pages")
except Exception:
    # Could not open the table (e.g. first run): create it from the schema.
    # `except Exception` rather than a bare except so Ctrl-C is not swallowed.
    tbl = ldb.create_table("pages", schema=PageSchema)
# --------------------------------------------------
66
+
67
def chunk_text(text, chunk_size=500):
    """Split page text into whitespace-joined chunks for vector search.

    Words are accumulated until their combined length (each word counted as
    ``len(word) + 1``) reaches ``chunk_size``; the chunk is then emitted and
    a new one started. Words are never split, so a chunk may exceed
    ``chunk_size`` by up to one word. Any trailing words form a final chunk.
    """
    pieces = []
    pending = []
    used = 0

    for token in text.split():
        pending.append(token)
        used += len(token) + 1
        if used < chunk_size:
            continue
        # Budget reached: emit the chunk and start over.
        pieces.append(" ".join(pending))
        pending, used = [], 0

    if pending:
        pieces.append(" ".join(pending))
    return pieces
85
+
86
def process_pdf(filepath):
    """Index one PDF into SQLite (full text) and LanceDB (chunk vectors).

    Pages whose extracted text is missing or shorter than 50 characters after
    stripping are skipped. A failure on a single page is reported and the
    remaining pages are still processed; a failure to read the file at all is
    reported and the file is abandoned. SQLite inserts are committed once per
    file.
    """
    filename = os.path.basename(filepath)
    print(f"Processing: {filename}...")

    try:
        with open(filepath, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            page_total = len(reader.pages)

            for index in range(page_total):
                page_no = index + 1
                try:
                    raw_text = reader.pages[index].extract_text()

                    # Junk Filter: Skip pages with too little text
                    if not raw_text or len(raw_text.strip()) < 50:
                        continue

                    # Remove null bytes
                    clean_text = raw_text.replace('\x00', '')

                    # A. Insert into SQLite (Keyword Search)
                    cursor.execute(
                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
                        (filename, filepath, page_no, clean_text)
                    )

                    # B. Insert into LanceDB (Vector Search)
                    chunks = chunk_text(clean_text)
                    vectors = model.encode(chunks)
                    rows = [
                        {
                            "vector": vector,
                            "text": chunk,
                            "filename": filename,
                            "page_number": page_no,
                            "filepath": filepath,
                        }
                        for chunk, vector in zip(chunks, vectors)
                    ]
                    if rows:
                        tbl.add(rows)

                except Exception as e:
                    print(f" Error on page {page_no}: {e}")

            conn.commit()

    except Exception as e:
        print(f"Failed to read {filename}: {e}")
136
+
137
def main():
    """Walk ``DATA_DIR``, index every ``.pdf`` file found, and print a summary."""
    print(f"Scanning directory: {DATA_DIR}")
    processed = 0

    for folder, _subdirs, names in os.walk(DATA_DIR):
        for name in names:
            if not name.lower().endswith('.pdf'):
                continue
            process_pdf(os.path.join(folder, name))
            processed += 1

    print(f"Done! Processed {processed} PDF files.")
    print("Run 'python app.py' next to start the server.")
150
+
151
+ if __name__ == "__main__":
152
+ main()
ingest_visual.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import lancedb
3
+ import fitz # PyMuPDF (The replacement for Poppler)
4
+ from sentence_transformers import SentenceTransformer
5
+ from lancedb.pydantic import LanceModel, Vector
6
+ from PIL import Image
7
+ import warnings
8
+
9
+ # Suppress warnings
10
+ warnings.filterwarnings("ignore")
11
+
12
+ # --- CONFIGURATION ---
13
+ DATA_DIR = "data"
14
+ VECTOR_DB_DIR = "data/lancedb"
15
+
16
print("Loading CLIP Model (Visual Intelligence)...")
model = SentenceTransformer('clip-ViT-B-32')

# Connect to DB
ldb = lancedb.connect(VECTOR_DB_DIR)


class VisualSchema(LanceModel):
    """Row layout of the "visuals" vector table: one row per rendered page."""

    vector: Vector(512)
    filename: str
    page: int
    filepath: str


# Create or Open the table
try:
    tbl = ldb.open_table("visuals")
except Exception:
    # Could not open the table (e.g. first run): create it from the schema.
    # `except Exception` rather than a bare except so Ctrl-C is not swallowed.
    tbl = ldb.create_table("visuals", schema=VisualSchema)
33
+
34
def process_pdf_visuals(filepath):
    """Embed every page of one PDF as an image and store it in the visuals table.

    Each page is rendered at half scale, encoded with the CLIP model and
    queued; rows are written to LanceDB in batches of 10. A page that fails
    is logged and skipped. A file that cannot be opened or whose final batch
    fails is logged and abandoned. The document is always closed.
    """
    filename = os.path.basename(filepath)
    print(f"👀 Scanning visuals: {filename}...")

    try:
        # OPEN PDF WITH PYMUPDF (No Poppler needed)
        doc = fitz.open(filepath)
    except Exception as e:
        print(f"Skipping {filename}: {e}")
        return

    try:
        data_to_add = []

        for i, page in enumerate(doc):
            try:
                # Render page to image (RGB).
                # Matrix(0.5, 0.5) halves PyMuPDF's default 72 DPI rendering
                # (about 36 DPI) to keep embedding fast.
                pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5))

                # Convert to PIL Image
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                # VISUAL EMBEDDING
                vector = model.encode(img)

                data_to_add.append({
                    "vector": vector,
                    "filename": filename,
                    "page": i + 1,
                    "filepath": filepath
                })

                if len(data_to_add) >= 10:
                    tbl.add(data_to_add)
                    data_to_add = []
            except Exception as e:
                # Skip pages that fail, but leave a trace of what was lost.
                print(f"   Skipping page {i + 1} of {filename}: {e}")
                continue

        if data_to_add:
            tbl.add(data_to_add)

    except Exception as e:
        print(f"Skipping {filename}: {e}")
    finally:
        # Release the PDF handle even when processing was aborted.
        doc.close()
77
+
78
def main():
    """Walk ``DATA_DIR`` and visually index every ``.pdf`` file found."""
    print("Starting Visual Ingestion...")
    for folder, _subdirs, names in os.walk(DATA_DIR):
        for name in names:
            if not name.lower().endswith('.pdf'):
                continue
            process_pdf_visuals(os.path.join(folder, name))
    print("Visual Indexing Complete!")
85
+
86
+ if __name__ == "__main__":
87
+ main()
requirements.txt ADDED
Binary file (2.39 kB). View file
 
templates/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en" class="dark">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Epstein Archive Explorer</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <script src="https://unpkg.com/htmx.org@1.9.10"></script>
9
+ <style>
10
+ .highlight { background-color: #fde047; color: black; padding: 2px 4px; border-radius: 2px; }
11
+ b { background-color: #fde047; color: black; font-weight: normal; }
12
+
13
+ /* Custom Radio Button Styling */
14
+ .mode-radio:checked + div {
15
+ background-color: #2563eb;
16
+ color: white;
17
+ border-color: #2563eb;
18
+ }
19
+ .mode-radio:checked + div.visual-mode {
20
+ background-color: #9333ea; /* Purple for Visual */
21
+ border-color: #9333ea;
22
+ }
23
+ </style>
24
+ </head>
25
+ <body class="bg-slate-900 text-slate-100 min-h-screen font-sans">
26
+
27
+ <div class="max-w-4xl mx-auto pt-16 px-4">
28
+ <h1 class="text-5xl font-bold text-center mb-2 bg-gradient-to-r from-red-500 to-orange-500 bg-clip-text text-transparent">
29
+ ARCHIVE EXPLORER
30
+ </h1>
31
+ <p class="text-center text-slate-400 mb-8">
32
+ Indexed <span class="text-white font-mono">4,085</span> Documents • Visual AI Active
33
+ </p>
34
+
35
+ <div class="flex justify-center gap-4 mb-6">
36
+ <label class="cursor-pointer">
37
+ <input type="radio" name="searchmode" value="text" class="mode-radio sr-only" checked
38
+ onchange="htmx.trigger('#search-input', 'search')">
39
+ <div class="px-6 py-2 bg-slate-800 border border-slate-700 rounded-full transition text-slate-400 font-medium hover:border-blue-500">
40
+ 📄 Text Search
41
+ </div>
42
+ </label>
43
+ <label class="cursor-pointer">
44
+ <input type="radio" name="searchmode" value="visual" class="mode-radio sr-only"
45
+ onchange="htmx.trigger('#search-input', 'search')">
46
+ <div class="visual-mode px-6 py-2 bg-slate-800 border border-slate-700 rounded-full transition text-slate-400 font-medium hover:border-purple-500">
47
+ 👁️ Visual AI
48
+ </div>
49
+ </label>
50
+ </div>
51
+
52
+ <div class="relative group z-10">
53
+ <div class="absolute -inset-1 bg-gradient-to-r from-red-600 to-orange-600 rounded-lg blur opacity-25 group-hover:opacity-75 transition duration-1000 group-hover:duration-200"></div>
54
+ <input
55
+ id="search-input"
56
+ type="text"
57
+ name="q"
58
+ class="relative w-full bg-slate-800 text-white text-xl p-4 rounded-lg border border-slate-700 focus:outline-none focus:border-red-500 placeholder-slate-500 shadow-xl"
59
+ placeholder="Search evidence..."
60
+ hx-get="/search"
61
+ hx-include="[name='searchmode']"
62
+ hx-trigger="keyup changed delay:300ms, search"
63
+ hx-target="#results-area"
64
+ autocomplete="off"
65
+ >
66
+ </div>
67
+
68
+ {% if trends %}
69
+ <div class="flex flex-wrap justify-center gap-2 mt-6 text-sm">
70
+ <span class="text-xs text-slate-500 uppercase font-bold tracking-widest mr-2 pt-1">Trending:</span>
71
+ {% for trend in trends %}
72
+ <span class="px-3 py-1 bg-slate-800 rounded-full border border-slate-700 text-xs text-red-400 cursor-pointer hover:bg-slate-700 hover:text-white transition"
73
+ onclick="document.getElementById('search-input').value='{{ trend.term }}'; htmx.trigger('#search-input', 'search')">
74
+ 🔥 {{ trend.term }}
75
+ </span>
76
+ {% endfor %}
77
+ </div>
78
+ {% endif %}
79
+ </div>
80
+
81
+ <div id="results-area" class="max-w-4xl mx-auto mt-10 px-4 pb-20 space-y-4">
82
+ <div class="text-center text-slate-600 italic mt-20">
83
+ Select a mode and start typing to uncover evidence...
84
+ </div>
85
+ </div>
86
+
87
+ </body>
88
+ </html>
templates/partials/results.html ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Search result list fragment. Expects `results`: a list of items with
   type, filename, page and text. #}
{% if not results %}
<div class="text-center text-slate-500 py-10">No documents found matching that query.</div>
{% endif %}

{% for result in results %}
{# urlencode keeps file names containing '#', '?', '%' or spaces from corrupting the link #}
<a href="/view/{{ result.filename|urlencode }}?page={{ result.page }}" target="_blank" class="block group">
    <div class="bg-slate-800 p-5 rounded-lg border border-slate-700 hover:border-red-500 transition shadow-lg relative overflow-hidden">

        <div class="absolute top-0 right-0 p-2">
            {% if result.type == 'Exact Match' %}
            <span class="bg-blue-900 text-blue-200 text-xs font-bold px-2 py-1 rounded uppercase tracking-wider">Exact Keyword</span>
            {% elif result.type == 'Visual Match' %}
            <span class="bg-purple-900 text-purple-200 text-xs font-bold px-2 py-1 rounded uppercase tracking-wider">Visual Match</span>
            {% else %}
            <span class="bg-purple-900 text-purple-200 text-xs font-bold px-2 py-1 rounded uppercase tracking-wider">AI Concept</span>
            {% endif %}
        </div>

        <div class="flex items-center gap-3 mb-2">
            <svg class="w-5 h-5 text-red-500" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path></svg>
            <h3 class="font-bold text-lg text-slate-200 group-hover:text-red-400 transition">{{ result.filename }}</h3>
            <span class="text-slate-500 text-sm">Page {{ result.page }}</span>
        </div>

        {# NOTE(review): rendered unescaped so <b> highlights survive; whatever builds
           result.text must HTML-escape document and user text itself. #}
        <p class="text-slate-400 text-sm leading-relaxed pl-8 border-l-2 border-slate-700">
            ...{{ result.text|safe }}...
        </p>
    </div>
</a>
{% endfor %}
templates/viewer.html ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en" class="bg-slate-900 h-screen">
<head>
    <meta charset="UTF-8">
    <title>{{ filename }} - Page {{ page }}</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <style>
        /* High-contrast greyscale filter toggled on the document iframe. */
        .detective-active {
            filter: contrast(175%) brightness(90%) grayscale(100%) invert(0%);
        }
        .iframe-container {
            transition: filter 0.3s ease;
        }
    </style>
</head>
<body class="h-screen flex flex-col overflow-hidden">

    <div class="h-14 bg-slate-800 border-b border-slate-700 flex items-center justify-between px-4 shrink-0 z-10">
        <div class="flex items-center gap-4">
            <a href="/" class="text-slate-400 hover:text-white transition">← Back to Search</a>
            <h1 class="font-bold text-white truncate max-w-md">{{ filename }}</h1>
            <span class="bg-red-600 text-white text-xs px-2 py-1 rounded">Page {{ page }}</span>
        </div>
        <div>
            <!-- Toolbar actions. The Detective Mode button used to sit inside <head>,
                 which is invalid markup; it now lives here, after Snap Evidence. -->
            <div class="flex items-center gap-2">
                <button onclick="snapEvidence()"
                        class="bg-red-600 hover:bg-red-700 text-white px-4 py-2 rounded shadow-lg font-bold flex items-center gap-2 transition">
                    <svg class="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 9a2 2 0 012-2h.93a2 2 0 001.664-.89l.812-1.22A2 2 0 0110.07 4h3.86a2 2 0 011.664.89l.812 1.22A2 2 0 0018.07 7H19a2 2 0 012 2v9a2 2 0 01-2 2H5a2 2 0 01-2-2V9z"></path><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M15 13a3 3 0 11-6 0 3 3 0 016 0z"></path></svg>
                    Snap Evidence
                </button>
                <button onclick="toggleDetective()"
                        class="bg-yellow-600 hover:bg-yellow-700 text-white px-3 py-1.5 rounded text-sm font-bold flex items-center gap-2 transition">
                    <svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z"></path></svg>
                    Detective Mode
                </button>
            </div>
36
+
37
+ <div id="snapModal" class="fixed inset-0 bg-black/90 hidden items-center justify-center z-50 p-4" onclick="closeModal()">
38
+ <div class="max-w-4xl w-full bg-slate-900 rounded-lg overflow-hidden shadow-2xl border border-slate-700" onclick="event.stopPropagation()">
39
+ <div class="p-4 border-b border-slate-700 flex justify-between items-center">
40
+ <h3 class="text-white font-bold">Evidence Receipt Generated</h3>
41
+ <button onclick="closeModal()" class="text-slate-400 hover:text-white">✕</button>
42
+ </div>
43
+ <div class="p-4 bg-black flex justify-center">
44
+ <img id="evidenceImg" src="" class="max-h-[70vh] object-contain border border-slate-800" />
45
+ </div>
46
+ <div class="p-4 flex gap-3 justify-end bg-slate-800">
47
+ <a id="downloadLink" href="#" download="evidence.png" class="bg-slate-700 hover:bg-slate-600 text-white px-4 py-2 rounded">Download</a>
48
+ <a id="twitterLink" href="#" target="_blank" class="bg-[#1DA1F2] hover:bg-[#1a91da] text-white px-4 py-2 rounded font-bold">Post to X</a>
49
+ </div>
50
+ </div>
51
+ </div>
52
+
53
+ <div class="absolute right-0 top-14 bottom-0 w-64 bg-slate-900 border-l border-slate-700 p-4 overflow-y-auto" id="related-panel">
54
+ <h3 class="text-xs font-bold text-slate-500 uppercase mb-4">Related Evidence (AI)</h3>
55
+ <div id="similar-results">
56
+ <div class="animate-pulse text-xs text-slate-600">Loading AI analysis...</div>
57
+ </div>
58
+ </div>
59
+
60
+ <script>
61
+ // Load similar pages automatically
62
+ fetch(`/api/similar/{{ filename }}/{{ page }}`)
63
+ .then(r => r.json())
64
+ .then(data => {
65
+ const container = document.getElementById('similar-results');
66
+ if(data.length === 0) {
67
+ container.innerHTML = '<div class="text-xs text-slate-600">No related links found.</div>';
68
+ return;
69
+ }
70
+
71
+ container.innerHTML = data.map(item => `
72
+ <a href="/view/${item.filename}?page=${item.page}" class="block mb-3 p-3 bg-slate-800 rounded hover:bg-slate-700 border border-slate-700 hover:border-blue-500 transition">
73
+ <div class="text-xs font-bold text-slate-300 truncate">${item.filename}</div>
74
+ <div class="text-[10px] text-blue-400 mb-1">Page ${item.page}</div>
75
+ <div class="text-[10px] text-slate-500 leading-tight">${item.snippet}</div>
76
+ </a>
77
+ `).join('');
78
+ });
79
+ </script>
80
+
81
+ <script>
82
+ async function snapEvidence() {
83
+ const btn = document.querySelector('button');
84
+ const originalText = btn.innerHTML;
85
+ btn.innerHTML = "Generating...";
86
+ btn.disabled = true;
87
+
88
+ try {
89
+ // Fetch the image from our new Python endpoint
90
+ const response = await fetch(`/api/snap/{{ filename }}/{{ page }}`);
91
+ const blob = await response.blob();
92
+ const url = URL.createObjectURL(blob);
93
+
94
+ // Setup Modal
95
+ const img = document.getElementById('evidenceImg');
96
+ img.src = url;
97
+
98
+ const dl = document.getElementById('downloadLink');
99
+ dl.href = url;
100
+ dl.download = `Epstein_Evidence_{{ filename }}_Pg{{ page }}.png`;
101
+
102
+ const tw = document.getElementById('twitterLink');
103
+ const text = encodeURIComponent(`Found in the Epstein Archive: "{{ filename }}" Page {{ page }}. \n\nIndexed via Archive Explorer.`);
104
+ tw.href = `https://twitter.com/intent/tweet?text=${text}`;
105
+
106
+ // Show Modal
107
+ document.getElementById('snapModal').classList.remove('hidden');
108
+ document.getElementById('snapModal').classList.add('flex');
109
+ } catch (e) {
110
+ alert("Error generating snap. Is Poppler installed?");
111
+ } finally {
112
+ btn.innerHTML = originalText;
113
+ btn.disabled = false;
114
+ }
115
+ }
116
+
117
+ function closeModal() {
118
+ document.getElementById('snapModal').classList.add('hidden');
119
+ document.getElementById('snapModal').classList.remove('flex');
120
+ }
121
+
122
+
123
+ function toggleDetective() {
124
+ const iframe = document.querySelector('iframe');
125
+ iframe.classList.toggle('detective-active');
126
+ }
127
+ </script>
128
+ </div>
129
+ </div>
130
+
131
+ <iframe src="{{ filepath }}#page={{ page }}" class="w-full h-full border-none bg-slate-500"></iframe>
132
+
133
+ </body>
134
+ </html>