esfiles / frontend /src /components /DatasetPanel.tsx
Besjon Cifliku
feat: initial project setup
db764ae
import { useState, useEffect } from "react";
import { api, getErrorMessage } from "../api";
import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types";
import StatusMessage from "./StatusMessage";
import MetricCard from "./MetricCard";
import Toggle from "./Toggle";
import Select from "./Select";
import Switch from "./Switch";
import LogViewer from "./LogViewer";
/** Props accepted by the DatasetPanel component. */
interface Props {
/**
 * Optional callback. After a dataset load succeeds, the panel fetches
 * `api.getStats()` and passes the result to this callback.
 * NOTE(review): `any` disables type checking for the payload — consider
 * replacing it with the stats type returned by `api.getStats()`.
 */
onStatsUpdate?: (stats: any) => void;
}
/**
 * Document cap sent to the backend when "Load All Documents" is enabled.
 * Also used as the upper bound of the "Max Documents" input.
 */
const LOAD_ALL_MAX_DOCS = 100000;

/** Lower bound of the "Max Documents" input. */
const MIN_MAX_DOCS = 10;

/** Upper bound of the "Min Text Length" input. */
const MAX_MIN_TEXT_LENGTH = 10000;

/**
 * Coerce a value coming from a numeric input into an integer inside
 * `[min, max]`.
 *
 * The `min`/`max` attributes on `<input type="number">` do not stop a user
 * from typing an out-of-range or fractional value, and a cleared field is
 * converted to `0` by the unary `+` in the change handlers, so the state can
 * hold values the inputs themselves declare invalid.
 *
 * @param value Raw numeric state value.
 * @param min Smallest allowed result; also returned for non-finite input.
 * @param max Largest allowed result.
 * @returns `value` truncated to an integer and clamped to `[min, max]`.
 */
function clampInt(value: number, min: number, max: number): number {
  if (!Number.isFinite(value)) return min;
  return Math.min(max, Math.max(min, Math.trunc(value)));
}

/**
 * Panel for inspecting, previewing and loading the dataset.
 *
 * On mount it fetches dataset metadata via `api.datasetInfo()`. The user picks
 * a source ("raw" or "embeddings"), optionally tunes the advanced settings,
 * and triggers `api.datasetLoad(...)`. For the "raw" source a small preview
 * can be fetched with `api.datasetPreview(...)`.
 *
 * @param onStatsUpdate Optional callback invoked with the result of
 *   `api.getStats()` after a successful load.
 */
export default function DatasetPanel({ onStatsUpdate }: Props) {
  const [info, setInfo] = useState<DatasetInfo | null>(null);
  const [error, setError] = useState("");

  // Load config
  const [source, setSource] = useState<"raw" | "embeddings">("raw");
  const [maxDocs, setMaxDocs] = useState(500);
  const [minTextLen, setMinTextLen] = useState(100);
  const [sourceFilter, setSourceFilter] = useState("");
  const [loadAll, setLoadAll] = useState(true);
  const [buildIndex, setBuildIndex] = useState(true);
  const [loading, setLoading] = useState(false);
  const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
  const [showAdvanced, setShowAdvanced] = useState(false);

  // Preview
  const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
  const [previewLoading, setPreviewLoading] = useState(false);

  // Fetch dataset metadata once on mount. The flag prevents a response that
  // arrives after unmount from writing into state.
  useEffect(() => {
    let cancelled = false;
    api
      .datasetInfo()
      .then((data) => {
        if (!cancelled) setInfo(data);
      })
      .catch((err: unknown) => {
        if (!cancelled) setError(getErrorMessage(err));
      });
    return () => {
      cancelled = true;
    };
  }, []);

  /** Fetch up to 10 preview documents, honoring the current source filter. */
  async function handlePreview() {
    setPreviewLoading(true);
    setError("");
    try {
      const res = await api.datasetPreview(10, sourceFilter || undefined);
      setPreviewDocs(res.documents);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setPreviewLoading(false);
    }
  }

  /**
   * Ask the backend to load the dataset with the current settings, then
   * refresh the stats for the parent (best effort).
   */
  async function handleLoad() {
    setLoading(true);
    setError("");
    setLoadResult(null);
    try {
      // NOTE(review): min_text_length and source_filter are sent for the
      // "embeddings" source as well, although their inputs are hidden in that
      // mode — confirm the backend ignores them there.
      const res = await api.datasetLoad({
        source,
        // Sanitize at submit time (not on every keystroke) so typing is not
        // disrupted while the request still respects the inputs' bounds.
        max_docs: loadAll
          ? LOAD_ALL_MAX_DOCS
          : clampInt(maxDocs, MIN_MAX_DOCS, LOAD_ALL_MAX_DOCS),
        min_text_length: loadAll ? 0 : clampInt(minTextLen, 0, MAX_MIN_TEXT_LENGTH),
        source_filter: sourceFilter || undefined,
        build_index: buildIndex,
      });
      setLoadResult(res);
      if (onStatsUpdate) {
        // A failed stats refresh must not turn a successful load into an error.
        try {
          const s = await api.getStats();
          onStatsUpdate(s);
        } catch (e) {
          console.warn("Failed to refresh stats after load:", e);
        }
      }
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setLoading(false);
    }
  }

  // Prefer total_chunks, fall back to chunks_indexed. Computed up front so the
  // JSX can branch on it without leaking a literal 0 into the DOM.
  const chunkCount = loadResult
    ? loadResult.total_chunks || loadResult.chunks_indexed
    : undefined;

  return (
    <div>
      {/* Info */}
      <div className="panel">
        <h2>Epstein Files Dataset</h2>
        <p className="panel-desc">
          Load documents from the publicly released U.S. House Oversight Committee Epstein Files
          via HuggingFace. Two sources available:
        </p>
        {info && (
          <div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
            <div
              className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("raw")}
            >
              <div className="result-header">
                <strong>Raw Text Documents</strong>
                <span className="badge">{info.raw_texts.size_mb} MB</span>
              </div>
              <div className="result-text">{info.raw_texts.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Columns: {info.raw_texts.columns?.join(", ")}
              </div>
            </div>
            <div
              className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("embeddings")}
            >
              <div className="result-header">
                <strong>Pre-computed Embeddings</strong>
                <span className="badge">{info.embeddings.vector_dim}d</span>
              </div>
              <div className="result-text">{info.embeddings.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Model: {info.embeddings.model}
              </div>
            </div>
          </div>
        )}
        <Toggle
          options={[
            { value: "raw", label: "Raw Texts" },
            { value: "embeddings", label: "ChromaDB Embeddings" },
          ]}
          value={source}
          onChange={(v) => setSource(v as "raw" | "embeddings")}
        />
      </div>

      {/* Load actions + advanced config */}
      <div className="panel">
        <h2>Load Dataset</h2>
        <div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
          <button className="btn btn-primary" onClick={handleLoad} disabled={loading}>
            {loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
          </button>
          {source === "raw" && (
            <button className="btn btn-secondary" onClick={handlePreview} disabled={previewLoading}>
              {previewLoading ? "Loading..." : "Preview Documents"}
            </button>
          )}
        </div>
        <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
          {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
        </button>
        {showAdvanced && (
          <div className="advanced-section">
            <div className="form-row">
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Load All Documents</label>
                <Switch
                  checked={loadAll}
                  onChange={setLoadAll}
                  label={loadAll ? "Yes (no limits)" : "No (use filters below)"}
                />
              </div>
              {!loadAll && (
                <>
                  <div className="form-group" style={{ maxWidth: 140 }}>
                    <label>Max Documents</label>
                    <input
                      type="number"
                      value={maxDocs}
                      onChange={(e) => setMaxDocs(+e.target.value)}
                      min={MIN_MAX_DOCS}
                      max={LOAD_ALL_MAX_DOCS}
                    />
                  </div>
                  {source === "raw" && (
                    <div className="form-group" style={{ maxWidth: 140 }}>
                      <label>Min Text Length</label>
                      <input
                        type="number"
                        value={minTextLen}
                        onChange={(e) => setMinTextLen(+e.target.value)}
                        min={0}
                        max={MAX_MIN_TEXT_LENGTH}
                      />
                    </div>
                  )}
                </>
              )}
              {source === "raw" && (
                <div className="form-group" style={{ maxWidth: 220 }}>
                  <label>Source Filter</label>
                  <Select
                    options={[
                      { value: "", label: "All sources" },
                      { value: "TEXT-", label: "TEXT- (native text files)" },
                      { value: "IMAGES-", label: "IMAGES- (OCR from images)" },
                    ]}
                    value={sourceFilter}
                    onChange={setSourceFilter}
                  />
                </div>
              )}
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Build Index</label>
                <Switch
                  checked={buildIndex}
                  onChange={setBuildIndex}
                  label={buildIndex ? "Yes (ready to search)" : "No (load only)"}
                />
              </div>
            </div>
          </div>
        )}
        {loading && (
          <StatusMessage
            type="loading"
            message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..."
          />
        )}
        <LogViewer active={loading} />
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {/* Load result */}
      {loadResult && (
        <div className="panel">
          <h2>Dataset Loaded</h2>
          <div className="metric-grid mb-2">
            {loadResult.documents_loaded !== undefined && (
              <MetricCard value={loadResult.documents_loaded} label="Documents" />
            )}
            {loadResult.documents_created !== undefined && (
              <MetricCard value={loadResult.documents_created} label="Documents" />
            )}
            {/* Ternary rather than `&&`: `0 && <X/>` would render a bare "0". */}
            {chunkCount ? <MetricCard value={chunkCount} label="Chunks" /> : null}
            {loadResult.chromadb_vectors !== undefined && (
              <MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
            )}
            <MetricCard value={`${loadResult.seconds}s`} label="Time" />
          </div>
          <StatusMessage
            type="ok"
            message={loadResult.index_built
              ? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
              : "Dataset loaded. Build the index from the Setup tab to enable search."}
          />
        </div>
      )}

      {/* Preview */}
      {previewDocs.length > 0 && (
        <div className="panel">
          <h2>Document Preview ({previewDocs.length} docs)</h2>
          {previewDocs.map((doc, i) => (
            <div key={i} className="result-card" style={{ marginBottom: 8 }}>
              <div className="result-header">
                <span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
                <span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
              </div>
              <div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
                {doc.text_preview}
              </div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}