Spaces:
Running
Running
| import gradio as gr | |
| import pandas as pd | |
| import re | |
| from pathlib import Path | |
| import tempfile | |
| import os | |
# Path to the corpus CSV shipped alongside the app in the Space repo.
FILE_PATH = Path("EPS_FILES_20K_NOV2025.csv")
# In-memory "research notebook": list of dicts with filename/snippet/text keys.
saved_items = []
print("Loading data...")
try:
    df = pd.read_csv(FILE_PATH, on_bad_lines="skip")
    # Lowercase all column names so the lookups below are case-insensitive.
    df.columns = [c.lower() for c in df.columns]
    # Prefer the expected column names; fall back to positional columns
    # (last column = text, first column = filename) when they are absent.
    text_col = "text" if "text" in df.columns else df.columns[-1]
    name_col = "filename" if "filename" in df.columns else df.columns[0]
    print(f"Success! Loaded {len(df)} documents.")
except Exception as e:
    print(f"Error loading file: {e}")
    # Placeholder one-row frame so the UI still renders with an error message
    # instead of crashing at import time.
    df = pd.DataFrame({"error": ["File not found. Check path and that the CSV is in the repo."]})
    text_col = "error"
    name_col = "error"
def base_view():
    """Default table shown before any search: first 50 docs, short snippets."""
    sample = df.head(50).copy()
    # Flatten each document's opening 280 characters onto a single line.
    sample["snippet"] = (
        sample[text_col]
        .astype(str)
        .str.slice(0, 280)
        .str.replace("\n", " ")
    )
    return sample[[name_col, "snippet"]]
def make_snippet_from_text(full_text: str, terms, *, context: int = 120, width: int = 280) -> str:
    """Build a one-line snippet of *full_text* around the first term match.

    Args:
        full_text: Document text; coerced to ``str`` defensively (may be NaN
            from pandas).
        terms: Iterable of search terms; matching is case-insensitive.
        context: Characters of leading context kept before the first match
            (previously hard-coded to 120).
        width: Total snippet length in characters (previously hard-coded
            to 280).

    Returns:
        A slice of at most ``width`` characters with newlines flattened to
        spaces. When no term occurs, falls back to the start of the text.
    """
    text = str(full_text)
    lowered = text.lower()
    # First occurrence of each term; drop the -1 "not found" sentinels.
    hits = [pos for pos in (lowered.find(term.lower()) for term in terms) if pos != -1]
    first = min(hits) if hits else 0
    start = max(first - context, 0)
    return text[start:start + width].replace("\n", " ")
def search_documents(query: str):
    """
    AND-search over text_col: every whitespace-separated term must appear
    (case-insensitive). Returns filename + snippet around the first match,
    capped at 500 rows; short/empty queries fall back to the default view.
    """
    if not query or len(query.strip()) < 2:
        return base_view()
    terms = query.split()
    if not terms:
        return base_view()

    # Lowercase the terms once instead of per row.
    needles = [term.lower() for term in terms]

    def contains_all(cell) -> bool:
        haystack = str(cell).lower()
        return all(needle in haystack for needle in needles)

    hits = df[df[text_col].apply(contains_all)].copy()
    hits["snippet"] = hits[text_col].apply(lambda doc: make_snippet_from_text(doc, terms))
    return hits.head(500)[[name_col, "snippet"]]
def display_document(evt: gr.SelectData, current_data, query: str):
    """
    Show the full text of the clicked row in the viewer, bolding every
    query term (markdown, case-insensitive). current_data is the table
    currently displayed (filename + snippet).
    """
    try:
        clicked = current_data.iloc[evt.index[0]]
        doc_name = clicked[name_col]
        # Re-fetch the full document from the master frame by filename.
        source_row = df[df[name_col] == doc_name].iloc[0]
        body = str(source_row[text_col])
        for term in (query or "").split():
            highlighter = re.compile(re.escape(term), re.IGNORECASE)
            body = highlighter.sub(lambda m: f"**{m.group(0)}**", body)
        return f"📄 **File:** `{doc_name}`\n\n" + body
    except Exception as e:
        return f"Error retrieving document text: {e}"
def save_current_document(current_view_text: str, query: str):
    """
    Append the document currently shown in the viewer to saved_items and
    return a preview DataFrame (filename + snippet).

    The viewer markdown starts with '📄 **File:** `FILENAME`'; the filename
    is parsed back out of that header between the backticks.
    """

    def notebook_preview(fallback_snippet: str) -> pd.DataFrame:
        # Existing notebook contents, or a single placeholder row.
        if saved_items:
            return pd.DataFrame(saved_items)[["filename", "snippet"]]
        return pd.DataFrame({"filename": ["<none>"], "snippet": [fallback_snippet]})

    try:
        if not current_view_text.startswith("📄 **File:**"):
            # Nothing has been opened in the viewer yet.
            return notebook_preview("No document selected.")

        header_match = re.search(r"`([^`]+)`", current_view_text)
        if header_match is None:
            raise ValueError("Could not parse filename from viewer header.")
        doc_name = header_match.group(1)

        source_row = df[df[name_col] == doc_name].iloc[0]
        body = str(source_row[text_col])
        terms = (query or "").split()
        if terms:
            snippet = make_snippet_from_text(body, terms)
        else:
            snippet = body[:280].replace("\n", " ")

        saved_items.append({"filename": doc_name, "snippet": snippet, "text": body})
        return pd.DataFrame(saved_items)[["filename", "snippet"]]
    except Exception as e:
        return notebook_preview(f"Error saving document: {e}")
def export_report():
    """
    Write all saved items into a temporary .txt file and return its path;
    Gradio serves the path as a downloadable file.
    """
    if not saved_items:
        content = "No items saved.\n"
    else:
        sections = []
        for number, item in enumerate(saved_items, start=1):
            sections.append(f"=== Document {number} ===")
            sections.append(f"Filename: {item['filename']}")
            sections.append("Citation: U.S. House Oversight Epstein Estate Documents, https://huggingface.co/spaces/theelderemo/epstein-files, https://github.com/theelderemo/Epstein-files")
            sections.append("")
            sections.append(item["text"])
            sections.append("\n\n")
        content = "\n".join(sections)
    # mkstemp leaves the file on disk after close, so the download link
    # stays valid after this function returns.
    handle, report_path = tempfile.mkstemp(suffix=".txt")
    with os.fdopen(handle, "w", encoding="utf-8") as out:
        out.write(content)
    return report_path
# --- UI LAYOUT & WIRING ---
# NOTE(review): indentation was lost in the pasted source; nesting below is
# reconstructed from Gradio conventions — confirm against the running Space.
with gr.Blocks(title="Epstein Docs Browser") as demo:
    # Content warning banner
    gr.Markdown(
        """
<div style="padding: 0.75rem 1rem; border-radius: 0.5rem; background-color: #2f0000; color: #ffd4d4; font-weight: 600;">
⚠️ CONTENT WARNING: This corpus contains graphic and highly sensitive material, including sexual abuse, exploitation, trafficking, and violence, as well as unverified allegations and speculation. Proceed with caution.
</div>
""",
    )
    gr.Markdown("# 📂 Epstein Estate Document Browser")
    # Responsible use summary
    gr.Markdown(
        """
### Responsible use (read before searching)
This dataset is a derivative collection of public documents released by the U.S. House Oversight Committee. It is intended **only** for research and exploratory analysis in support of public‑interest investigation.
- Do **not** use this corpus to fine‑tune or train generative models.
- Do **not** use it for doxing, harassment, or targeted attacks.
- Do **not** attempt to circumvent or reverse redactions.
- Do **not** present unverified allegations from these documents as established fact.
You are solely responsible for complying with applicable law, institutional policies, and the terms of the original House release. If you plan to use this corpus in a public‑facing product or at scale, seek independent legal advice.
### The corpus contains:
OCR noise, misrecognized characters, broken formatting, redaction blocks, stamps, and markers inherited from the original scans. Therefore, some of it may not be formatted correctly. Feel free to contribute, to improve the data.
"""
    )
    gr.Markdown(
        "Search 20,000+ documents. "
        "**Multiple words are treated as AND (all must appear). "
        "Click a row to read the full file below.**"
    )
    # Search input + button share one row.
    with gr.Row():
        search_box = gr.Textbox(
            label="Search (Keywords, Names, Flight Logs)",
            placeholder="Type here...",
            scale=3,
        )
        search_btn = gr.Button("Search", variant="primary", scale=1)
    # Result-count line, updated alongside the table.
    summary = gr.Markdown("")
    with gr.Row():
        results_table = gr.Dataframe(
            headers=[name_col, "snippet"],
            datatype="str",
            label="Search Results (Click a row to view)",
            interactive=False,
            wrap=True,
        )
    with gr.Row():
        doc_viewer = gr.Markdown(
            label="Document Content",
            value="Select a document above to read it here...",
        )
    with gr.Row():
        save_btn = gr.Button("Save current document to notebook")
        downloaded_file = gr.File(label="Download saved items (.txt)")
    # Notebook preview table (placed after the save row; original nesting
    # relative to the Row is ambiguous in the source — confirm).
    saved_preview = gr.Dataframe(
        headers=["filename", "snippet"],
        datatype="str",
        label="Saved items (research notebook)",
        interactive=False,
        wrap=True,
    )
    # --- INTERACTIONS ---
    def run_search_and_summary(query):
        # Run the search and report how many rows ended up displayed.
        res = search_documents(query)
        return res, f"**{len(res)}** results shown."
    search_btn.click(
        fn=run_search_and_summary,
        inputs=search_box,
        outputs=[results_table, summary],
    )
    # Pressing Enter in the textbox triggers the same search.
    search_box.submit(
        fn=run_search_and_summary,
        inputs=search_box,
        outputs=[results_table, summary],
    )
    # On page load, show the initial 50-document sample.
    demo.load(
        fn=lambda: (base_view(), "**50** documents shown (initial sample)."),
        inputs=None,
        outputs=[results_table, summary],
    )
    # Row click -> update viewer
    results_table.select(
        fn=display_document,
        inputs=[results_table, search_box],
        outputs=doc_viewer,
    )
    # Save current viewer doc -> update saved_preview
    save_btn.click(
        fn=save_current_document,
        inputs=[doc_viewer, search_box],
        outputs=saved_preview,
    )
    # Download TXT of saved items
    download_btn = gr.Button("Generate TXT report from saved items")
    download_btn.click(
        fn=export_report,
        inputs=None,
        outputs=downloaded_file,
    )
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 so the app is reachable from
    # outside the Spaces container.
    demo.launch(server_name="0.0.0.0", server_port=7860)