| import { useState, useEffect } from "react"; |
| import { api, getErrorMessage } from "../api"; |
| import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types"; |
| import StatusMessage from "./StatusMessage"; |
| import MetricCard from "./MetricCard"; |
| import Toggle from "./Toggle"; |
| import Select from "./Select"; |
| import Switch from "./Switch"; |
| import LogViewer from "./LogViewer"; |
|
|
/**
 * Props accepted by the dataset panel component.
 */
type Props = {
  /**
   * Optional callback. When supplied, the panel calls it with the value
   * resolved by `api.getStats()` after a dataset load succeeds.
   *
   * NOTE(review): the payload is typed `any`; consider replacing it with the
   * stats type returned by the API client once that type is confirmed.
   */
  onStatsUpdate?: (stats: any) => void;
};
|
|
/** The two dataset sources the panel can load from. */
type DatasetSource = "raw" | "embeddings";

/** Document cap sent when "Load All Documents" is on; also the upper bound of the Max Documents input. */
const LOAD_ALL_MAX_DOCS = 100000;

/** Number of documents requested by the "Preview Documents" action. */
const PREVIEW_DOC_COUNT = 10;

/**
 * Panel for choosing a dataset source, loading it into the engine, and
 * previewing raw documents.
 *
 * On mount it fetches dataset metadata via `api.datasetInfo()`. "Load into
 * Engine" calls `api.datasetLoad()` with the current settings and, on success,
 * refreshes stats through `onStatsUpdate` when that callback is provided.
 * "Preview Documents" (raw source only) calls `api.datasetPreview()`.
 *
 * @param onStatsUpdate Optional callback invoked with the result of
 *   `api.getStats()` after a successful load.
 */
export default function DatasetPanel({ onStatsUpdate }: Props) {
  const [info, setInfo] = useState<DatasetInfo | null>(null);
  const [error, setError] = useState("");

  // Load configuration
  const [source, setSource] = useState<DatasetSource>("raw");
  const [maxDocs, setMaxDocs] = useState(500);
  const [minTextLen, setMinTextLen] = useState(100);
  const [sourceFilter, setSourceFilter] = useState("");
  const [loadAll, setLoadAll] = useState(true);
  const [buildIndex, setBuildIndex] = useState(true);
  const [loading, setLoading] = useState(false);
  const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
  const [showAdvanced, setShowAdvanced] = useState(false);

  // Preview
  const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
  const [previewLoading, setPreviewLoading] = useState(false);

  // Fetch dataset metadata once on mount. The flag stops a late response from
  // updating state after the component has unmounted.
  useEffect(() => {
    let cancelled = false;
    api
      .datasetInfo()
      .then((result) => {
        if (!cancelled) setInfo(result);
      })
      .catch((err) => {
        if (!cancelled) setError(getErrorMessage(err));
      });
    return () => {
      cancelled = true;
    };
  }, []);

  /** Fetches a small sample of documents, honoring the current source filter. */
  async function handlePreview() {
    setPreviewLoading(true);
    setError("");
    try {
      const res = await api.datasetPreview(PREVIEW_DOC_COUNT, sourceFilter || undefined);
      setPreviewDocs(res.documents);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setPreviewLoading(false);
    }
  }

  /**
   * Loads the dataset with the current settings and stores the response.
   * With "Load All" on, the document cap is LOAD_ALL_MAX_DOCS and the minimum
   * text length is 0; otherwise the user-entered values are sent.
   */
  async function handleLoad() {
    setLoading(true);
    setError("");
    setLoadResult(null);
    try {
      // NOTE(review): source_filter is sent for the "embeddings" source too,
      // although its control is only shown for "raw" — confirm the backend
      // ignores it there.
      const res = await api.datasetLoad({
        source,
        max_docs: loadAll ? LOAD_ALL_MAX_DOCS : maxDocs,
        min_text_length: loadAll ? 0 : minTextLen,
        source_filter: sourceFilter || undefined,
        build_index: buildIndex,
      });
      setLoadResult(res);
      if (onStatsUpdate) {
        // Best effort: a failed stats refresh must not turn a successful load
        // into an error.
        try {
          const stats = await api.getStats();
          onStatsUpdate(stats);
        } catch (e) {
          console.warn("Failed to refresh stats after load:", e);
        }
      }
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setLoading(false);
    }
  }

  // Gate the Chunks card on a boolean. Using the numeric count directly as the
  // left operand of `&&` makes React render a stray "0" when the count is zero.
  const hasChunkCount =
    loadResult != null &&
    (loadResult.total_chunks != null || loadResult.chunks_indexed != null);
  const chunkCount = loadResult?.total_chunks || loadResult?.chunks_indexed || 0;

  return (
    <div>
      {/* Info */}
      <div className="panel">
        <h2>Epstein Files Dataset</h2>
        <p className="panel-desc">
          Load documents from the publicly released U.S. House Oversight Committee Epstein Files
          via HuggingFace. Two sources available:
        </p>

        {info && (
          <div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
            <div
              className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("raw")}
            >
              <div className="result-header">
                <strong>Raw Text Documents</strong>
                <span className="badge">{info.raw_texts.size_mb} MB</span>
              </div>
              <div className="result-text">{info.raw_texts.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Columns: {info.raw_texts.columns?.join(", ")}
              </div>
            </div>
            <div
              className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("embeddings")}
            >
              <div className="result-header">
                <strong>Pre-computed Embeddings</strong>
                <span className="badge">{info.embeddings.vector_dim}d</span>
              </div>
              <div className="result-text">{info.embeddings.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Model: {info.embeddings.model}
              </div>
            </div>
          </div>
        )}

        <Toggle
          options={[
            { value: "raw", label: "Raw Texts" },
            { value: "embeddings", label: "ChromaDB Embeddings" },
          ]}
          value={source}
          onChange={(v) => setSource(v as DatasetSource)}
        />
      </div>

      {/* Load actions + advanced config */}
      <div className="panel">
        <h2>Load Dataset</h2>
        <div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
          <button type="button" className="btn btn-primary" onClick={handleLoad} disabled={loading}>
            {loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
          </button>
          {source === "raw" && (
            <button
              type="button"
              className="btn btn-secondary"
              onClick={handlePreview}
              disabled={previewLoading}
            >
              {previewLoading ? "Loading..." : "Preview Documents"}
            </button>
          )}
        </div>

        <button
          type="button"
          className="advanced-toggle"
          onClick={() => setShowAdvanced((open) => !open)}
        >
          {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
        </button>

        {showAdvanced && (
          <div className="advanced-section">
            <div className="form-row">
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Load All Documents</label>
                <Switch
                  checked={loadAll}
                  onChange={setLoadAll}
                  label={loadAll ? "Yes (no limits)" : "No (use filters below)"}
                />
              </div>
              {!loadAll && (
                <>
                  <div className="form-group" style={{ maxWidth: 140 }}>
                    <label>Max Documents</label>
                    <input
                      type="number"
                      value={maxDocs}
                      onChange={(e) => setMaxDocs(Number(e.target.value))}
                      min={10}
                      max={LOAD_ALL_MAX_DOCS}
                    />
                  </div>
                  {source === "raw" && (
                    <div className="form-group" style={{ maxWidth: 140 }}>
                      <label>Min Text Length</label>
                      <input
                        type="number"
                        value={minTextLen}
                        onChange={(e) => setMinTextLen(Number(e.target.value))}
                        min={0}
                        max={10000}
                      />
                    </div>
                  )}
                </>
              )}
              {source === "raw" && (
                <div className="form-group" style={{ maxWidth: 220 }}>
                  <label>Source Filter</label>
                  <Select
                    options={[
                      { value: "", label: "All sources" },
                      { value: "TEXT-", label: "TEXT- (native text files)" },
                      { value: "IMAGES-", label: "IMAGES- (OCR from images)" },
                    ]}
                    value={sourceFilter}
                    onChange={setSourceFilter}
                  />
                </div>
              )}
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Build Index</label>
                <Switch
                  checked={buildIndex}
                  onChange={setBuildIndex}
                  label={buildIndex ? "Yes (ready to search)" : "No (load only)"}
                />
              </div>
            </div>
          </div>
        )}

        {loading && (
          <StatusMessage
            type="loading"
            message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..."
          />
        )}

        <LogViewer active={loading} />
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {/* Load result */}
      {loadResult && (
        <div className="panel">
          <h2>Dataset Loaded</h2>
          <div className="metric-grid mb-2">
            {loadResult.documents_loaded !== undefined && (
              <MetricCard value={loadResult.documents_loaded} label="Documents" />
            )}
            {loadResult.documents_created !== undefined && (
              <MetricCard value={loadResult.documents_created} label="Documents" />
            )}
            {hasChunkCount && <MetricCard value={chunkCount} label="Chunks" />}
            {loadResult.chromadb_vectors !== undefined && (
              <MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
            )}
            <MetricCard value={`${loadResult.seconds}s`} label="Time" />
          </div>
          <StatusMessage
            type="ok"
            message={loadResult.index_built
              ? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
              : "Dataset loaded. Build the index from the Setup tab to enable search."}
          />
        </div>
      )}

      {/* Document preview */}
      {previewDocs.length > 0 && (
        <div className="panel">
          <h2>Document Preview ({previewDocs.length} docs)</h2>
          {previewDocs.map((doc, i) => (
            <div key={i} className="result-card" style={{ marginBottom: 8 }}>
              <div className="result-header">
                <span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
                <span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
              </div>
              <div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
                {doc.text_preview}
              </div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
|
|