| """ |
| Epstein Files Dataset Loader |
| |
| Loads data from two HuggingFace sources: |
| 1. teyler/epstein-files-20k — raw OCR text (2.1M rows, filename + text) |
| 2. devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB — pre-computed |
| all-MiniLM-L6-v2 embeddings in ChromaDB format |
| |
| Both can feed directly into the ContextualSimilarityEngine pipeline. |
| """ |
|
|
| import logging |
| import re |
| import time |
| from pathlib import Path |
| from typing import Optional |
|
|
| import numpy as np |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# HuggingFace Hub dataset ids: the raw OCR text rows and the pre-computed
# ChromaDB embedding store, respectively.
RAW_DATASET = "teyler/epstein-files-20k"
EMBEDDINGS_DATASET = "devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB"
|
|
|
|
def load_raw_dataset(
    max_docs: Optional[int] = None,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
) -> list[dict]:
    """
    Load raw Epstein Files from HuggingFace.

    Rows are read in dataset order from the "train" split of RAW_DATASET.
    A row is kept when its whitespace-stripped text has at least
    ``min_text_length`` characters and, if ``source_filter`` is given, its
    filename starts with that prefix.

    Args:
        max_docs: Maximum number of documents to return (None = no limit).
            Zero or a negative value returns an empty list.
        min_text_length: Skip documents whose stripped text is shorter than this.
        source_filter: Filter by filename prefix, e.g. "TEXT-" or "IMAGES-".

    Returns:
        List of {"doc_id": str, "text": str, "filename": str}. ``doc_id`` is
        the filename stem; a row without a filename is named "doc_<row index>".
    """
    # Function-scope import: `datasets` is only required when this loader runs.
    from datasets import load_dataset

    # Compare against None instead of relying on truthiness: `max_docs=0` used
    # to be treated as "no limit", while negative values already yielded nothing.
    if max_docs is not None and max_docs <= 0:
        return []

    t0 = time.time()
    logger.info("Loading %s from HuggingFace...", RAW_DATASET)

    ds = load_dataset(RAW_DATASET, split="train")
    docs: list[dict] = []

    for i, row in enumerate(ds):
        text = (row.get("text") or "").strip()
        if len(text) < min_text_length:
            continue

        filename = row.get("filename") or f"doc_{i}"
        if source_filter and not filename.startswith(source_filter):
            continue

        docs.append(
            {"doc_id": Path(filename).stem, "text": text, "filename": filename}
        )

        # Stop as soon as the limit is reached so no further rows are read.
        if max_docs is not None and len(docs) >= max_docs:
            break

    elapsed = time.time() - t0
    logger.info("Loaded %d documents in %.1fs", len(docs), elapsed)
    return docs
|
|
|
|
def load_raw_to_engine(
    engine,
    max_docs: Optional[int] = 500,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
    build_index: bool = True,
) -> dict:
    """
    Feed documents from the raw dataset into a ContextualSimilarityEngine.

    Each document returned by ``load_raw_dataset`` is passed to
    ``engine.add_document(doc_id, text)``. A document for which that call
    raises ``ValueError`` is logged as a warning and counted as skipped.

    Args:
        engine: Object providing ``add_document(doc_id, text)`` and
            ``build_index(show_progress=...)``.
        max_docs: Limit documents to load.
        min_text_length: Skip short documents.
        source_filter: Filter by filename prefix.
        build_index: Whether to call ``engine.build_index`` after loading;
            it is only called when at least one chunk was produced.

    Returns:
        Stats dict with "documents_loaded", "documents_skipped",
        "total_chunks", "index_built" and "seconds".
    """
    started = time.time()
    documents = load_raw_dataset(max_docs, min_text_length, source_filter)

    chunk_total = 0
    rejected = 0
    for document in documents:
        identifier = document["doc_id"]
        try:
            produced = engine.add_document(identifier, document["text"])
            chunk_total += len(produced)
        except ValueError as err:
            logger.warning("Skipped document '%s': %s", identifier, err)
            rejected += 1

    # Evaluated once and reused for both the build decision and the stats.
    index_built = build_index and chunk_total > 0
    if index_built:
        engine.build_index(show_progress=True)

    duration = time.time() - started
    stats = {
        "documents_loaded": len(documents) - rejected,
        "documents_skipped": rejected,
        "total_chunks": chunk_total,
        "index_built": index_built,
        "seconds": round(duration, 2),
    }
    return stats
|
|
|
|
def load_chromadb_embeddings(
    download_dir: str = "./chroma_epstein",
) -> dict:
    """
    Download the pre-computed ChromaDB store and open its first collection.

    The EMBEDDINGS_DATASET snapshot is downloaded into ``download_dir`` and
    the store is located by looking for ``chroma.sqlite3`` first in
    ``<download>/chroma_db`` and then in the download root. No texts or
    vectors are read here; callers fetch them from the returned collection.

    Args:
        download_dir: Local directory the dataset snapshot is downloaded into.

    Returns:
        Dict with:
            "chroma_dir": directory that contains chroma.sqlite3,
            "collection_name": name of the opened collection,
            "total_vectors": value of ``collection.count()``,
            "seconds": elapsed wall-clock time rounded to 2 decimals,
            "_collection": the opened collection object,
            "_client": the ``chromadb.PersistentClient`` it belongs to.

    Raises:
        FileNotFoundError: If no chroma.sqlite3 is found in the download.
        ValueError: If the store contains no collections.
    """
    # Function-scope imports: these packages are only required when the
    # embeddings store is actually requested.
    import chromadb
    from huggingface_hub import snapshot_download

    t0 = time.time()
    logger.info("Downloading %s from HuggingFace...", EMBEDDINGS_DATASET)

    local_path = snapshot_download(
        repo_id=EMBEDDINGS_DATASET,
        repo_type="dataset",
        local_dir=download_dir,
    )

    # Probe the known layouts in priority order: nested "chroma_db" folder
    # first, then the snapshot root itself.
    root = Path(local_path)
    chroma_dir = next(
        (
            str(candidate)
            for candidate in (root / "chroma_db", root)
            if (candidate / "chroma.sqlite3").exists()
        ),
        None,
    )
    if chroma_dir is None:
        raise FileNotFoundError(
            f"ChromaDB files not found in {local_path}. "
            f"Expected chroma.sqlite3 in the download."
        )

    client = chromadb.PersistentClient(path=chroma_dir)
    collections = client.list_collections()
    if not collections:
        raise ValueError("No collections found in ChromaDB.")
    if len(collections) > 1:
        logger.warning(
            "ChromaDB store contains %d collections; using the first one.",
            len(collections),
        )

    collection = collections[0]
    # Some chromadb releases (0.6.x) return collection *names* from
    # list_collections() instead of Collection objects; resolve the name so
    # that .count() and .name below work on either API shape.
    if isinstance(collection, str):
        collection = client.get_collection(collection)

    count = collection.count()
    logger.info("ChromaDB collection '%s': %s vectors", collection.name, count)

    elapsed = time.time() - t0
    return {
        "chroma_dir": chroma_dir,
        "collection_name": collection.name,
        "total_vectors": count,
        "seconds": round(elapsed, 2),
        "_collection": collection,
        "_client": client,
    }
|
|
|
|
def import_chromadb_to_engine(
    engine,
    max_chunks: Optional[int] = None,
    batch_size: int = 1000,
    download_dir: str = "./chroma_epstein",
) -> dict:
    """
    Import the documents of the pre-computed ChromaDB collection into the engine.

    Records are paged out of the collection, grouped by the stem of their
    ``source`` metadata, joined with blank lines into one text per source and
    handed to ``engine.add_document``. Afterwards ``engine.build_index`` is
    called if any chunk was produced.

    NOTE: the stored ChromaDB vectors are only counted, not transferred. The
    engine receives plain text and does its own processing in
    ``add_document`` / ``build_index``.

    Args:
        engine: Object providing ``add_document(doc_id, text)`` and
            ``build_index(show_progress=...)``.
        max_chunks: Limit on ChromaDB records to fetch (None = all). Zero or
            a negative value fetches nothing.
        batch_size: How many records to fetch from ChromaDB at a time.
        download_dir: Local directory for the ChromaDB snapshot download.

    Returns:
        Stats dict with "source", "chromadb_vectors" (records that had both
        text and an embedding), "documents_created", "chunks_indexed",
        "index_built" and "seconds".
    """
    t0 = time.time()
    chroma_data = load_chromadb_embeddings(download_dir=download_dir)
    collection = chroma_data["_collection"]
    total = chroma_data["total_vectors"]

    # `is not None` so that max_chunks=0 means "fetch nothing" rather than
    # silently falling back to the whole collection.
    if max_chunks is not None:
        total = min(total, max(max_chunks, 0))

    # Texts grouped per source stem, in order of first appearance.
    doc_chunks: dict[str, list[str]] = {}
    # Only a count is kept: holding every embedding in memory served no
    # purpose because the vectors are never passed to the engine.
    vectors_seen = 0

    offset = 0
    while offset < total:
        results = collection.get(
            limit=min(batch_size, total - offset),
            offset=offset,
            include=["embeddings", "documents", "metadatas"],
        )

        ids = results["ids"]
        if not ids:
            break

        documents = results["documents"]
        embeddings = results["embeddings"]
        metadatas = results["metadatas"]

        for i, _ in enumerate(ids):
            text = documents[i] if documents is not None else ""
            # Explicit None checks: the embeddings column may be an array,
            # whose truth value is ambiguous.
            has_vector = embeddings is not None and embeddings[i] is not None
            if not text or not has_vector:
                continue

            # A record without metadata comes back as None, not as {}.
            metadata = (metadatas[i] if metadatas is not None else None) or {}
            source = metadata.get("source", f"chunk_{offset + i}")
            stem = Path(str(source)).stem if source else "unknown"

            doc_chunks.setdefault(stem, []).append(text)
            vectors_seen += 1

        offset += len(ids)
        logger.info("Fetched %d/%d vectors from ChromaDB", offset, total)

    docs_added = 0
    chunks_added = 0
    for doc_id, texts in doc_chunks.items():
        combined = "\n\n".join(texts)
        try:
            chunks = engine.add_document(doc_id, combined)
            chunks_added += len(chunks)
            docs_added += 1
        except ValueError as e:
            logger.warning("Skipped ChromaDB document '%s': %s", doc_id, e)

    if chunks_added > 0:
        engine.build_index(show_progress=True)

    elapsed = time.time() - t0
    return {
        "source": "chromadb_embeddings",
        "chromadb_vectors": vectors_seen,
        "documents_created": docs_added,
        "chunks_indexed": chunks_added,
        "index_built": chunks_added > 0,
        "seconds": round(elapsed, 2),
    }
|
|
|
|
def get_dataset_info() -> dict:
    """Describe the two supported datasets; builds a static dict, no download."""
    hub_prefix = "https://huggingface.co/datasets/"

    # Raw OCR text dataset.
    raw_texts = {
        "dataset_id": RAW_DATASET,
        "url": hub_prefix + RAW_DATASET,
        "description": "2.1M OCR text documents from U.S. House Oversight Committee Epstein Files release",
        "columns": ["filename", "text"],
        "size_mb": 106,
    }

    # Pre-computed embedding store.
    embeddings = {
        "dataset_id": EMBEDDINGS_DATASET,
        "url": hub_prefix + EMBEDDINGS_DATASET,
        "description": "Pre-computed all-MiniLM-L6-v2 embeddings in ChromaDB format (~100K+ chunks)",
        "model": "all-MiniLM-L6-v2",
        "vector_dim": 384,
    }

    info = {}
    info["raw_texts"] = raw_texts
    info["embeddings"] = embeddings
    return info
|
|