File size: 9,707 Bytes
import { useState, useEffect } from "react";
import { api, getErrorMessage } from "../api";
import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types";
import StatusMessage from "./StatusMessage";
import MetricCard from "./MetricCard";
import Toggle from "./Toggle";
import Select from "./Select";
import Switch from "./Switch";
import LogViewer from "./LogViewer";
/** Props accepted by the DatasetPanel component. */
type Props = {
  /**
   * Optional hook for the parent. When supplied, it is called with whatever
   * `api.getStats()` resolves to after a dataset load succeeds. The payload
   * is intentionally left as `any` so existing typed handlers stay assignable.
   */
  onStatsUpdate?: (stats: any) => void;
};
/** The two dataset sources the panel can select between. */
type DatasetSource = "raw" | "embeddings";

/** Lower bound declared on the "Max Documents" input. */
const MAX_DOCS_FLOOR = 10;
/** Upper bound declared on the "Max Documents" input; also the cap sent when "Load All" is on. */
const MAX_DOCS_CEILING = 100000;
/** Upper bound declared on the "Min Text Length" input. */
const MIN_TEXT_LEN_CEILING = 10000;

/**
 * Truncates `value` to an integer and clamps it into `[min, max]`.
 * Non-finite input (NaN, ±Infinity) falls back to `min`.
 */
function clampToRange(value: number, min: number, max: number): number {
  if (!Number.isFinite(value)) return min;
  return Math.min(max, Math.max(min, Math.trunc(value)));
}

/** Runtime check that a toggle value is one of the known dataset sources. */
function isDatasetSource(value: unknown): value is DatasetSource {
  return value === "raw" || value === "embeddings";
}

/**
 * Panel for choosing a dataset source, optionally previewing documents, and
 * loading the dataset through `api.datasetLoad`.
 *
 * Dataset metadata is requested once on mount via `api.datasetInfo()`. After a
 * successful load, `onStatsUpdate` (if provided) is called with the result of
 * `api.getStats()`; a failure of that refresh is only logged, not surfaced.
 *
 * @param onStatsUpdate Optional callback invoked with refreshed stats after a load.
 */
export default function DatasetPanel({ onStatsUpdate }: Props) {
  const [info, setInfo] = useState<DatasetInfo | null>(null);
  const [error, setError] = useState("");

  // Load config
  const [source, setSource] = useState<DatasetSource>("raw");
  const [maxDocs, setMaxDocs] = useState(500);
  const [minTextLen, setMinTextLen] = useState(100);
  const [sourceFilter, setSourceFilter] = useState("");
  const [loadAll, setLoadAll] = useState(true);
  const [buildIndex, setBuildIndex] = useState(true);
  const [loading, setLoading] = useState(false);
  const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
  const [showAdvanced, setShowAdvanced] = useState(false);

  // Preview
  const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
  const [previewLoading, setPreviewLoading] = useState(false);

  useEffect(() => {
    // Ignore a response that settles after the component has unmounted.
    let cancelled = false;
    api
      .datasetInfo()
      .then((data) => {
        if (!cancelled) setInfo(data);
      })
      .catch((err) => {
        if (!cancelled) setError(getErrorMessage(err));
      });
    return () => {
      cancelled = true;
    };
  }, []);

  /** Fetches up to 10 preview documents, honoring the current source filter. */
  async function handlePreview() {
    setPreviewLoading(true);
    setError("");
    try {
      const res = await api.datasetPreview(10, sourceFilter || undefined);
      setPreviewDocs(res.documents);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setPreviewLoading(false);
    }
  }

  /** Submits the load request and, on success, refreshes the parent's stats. */
  async function handleLoad() {
    setLoading(true);
    setError("");
    setLoadResult(null);
    try {
      const res = await api.datasetLoad({
        source,
        // "Load All" still sends a finite cap. Otherwise the typed values are
        // clamped here, because the inputs' min/max attributes do not stop
        // out-of-range or empty entries from reaching state.
        max_docs: loadAll
          ? MAX_DOCS_CEILING
          : clampToRange(maxDocs, MAX_DOCS_FLOOR, MAX_DOCS_CEILING),
        min_text_length: loadAll ? 0 : clampToRange(minTextLen, 0, MIN_TEXT_LEN_CEILING),
        // NOTE(review): the filter control is only shown for the "raw" source, but a
        // previously chosen filter is still sent for "embeddings" — confirm intended.
        source_filter: sourceFilter || undefined,
        build_index: buildIndex,
      });
      setLoadResult(res);
      if (onStatsUpdate) {
        // Best-effort refresh: a stats failure must not mark the load as failed.
        try {
          const s = await api.getStats();
          onStatsUpdate(s);
        } catch (e) {
          console.warn("Failed to refresh stats after load:", e);
        }
      }
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setLoading(false);
    }
  }

  // First truthy chunk figure, or 0. Computed once so the JSX below can use a
  // ternary instead of `count && <Card/>`, which would render a literal "0".
  const chunkCount = loadResult
    ? loadResult.total_chunks || loadResult.chunks_indexed || 0
    : 0;

  return (
    <div>
      {/* Info */}
      <div className="panel">
        <h2>Epstein Files Dataset</h2>
        <p className="panel-desc">
          Load documents from the publicly released U.S. House Oversight Committee Epstein Files
          via HuggingFace. Two sources available:
        </p>
        {info && (
          <div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
            <div
              className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("raw")}
            >
              <div className="result-header">
                <strong>Raw Text Documents</strong>
                <span className="badge">{info.raw_texts.size_mb} MB</span>
              </div>
              <div className="result-text">{info.raw_texts.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Columns: {info.raw_texts.columns?.join(", ")}
              </div>
            </div>
            <div
              className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
              style={{ flex: "1 1 280px", cursor: "pointer" }}
              onClick={() => setSource("embeddings")}
            >
              <div className="result-header">
                <strong>Pre-computed Embeddings</strong>
                <span className="badge">{info.embeddings.vector_dim}d</span>
              </div>
              <div className="result-text">{info.embeddings.description}</div>
              <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
                Model: {info.embeddings.model}
              </div>
            </div>
          </div>
        )}
        <Toggle
          options={[
            { value: "raw", label: "Raw Texts" },
            { value: "embeddings", label: "ChromaDB Embeddings" },
          ]}
          value={source}
          onChange={(v) => {
            // Accept only known sources rather than asserting the type.
            if (isDatasetSource(v)) setSource(v);
          }}
        />
      </div>

      {/* Load actions + advanced config */}
      <div className="panel">
        <h2>Load Dataset</h2>
        <div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
          <button className="btn btn-primary" onClick={handleLoad} disabled={loading}>
            {loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
          </button>
          {source === "raw" && (
            <button className="btn btn-secondary" onClick={handlePreview} disabled={previewLoading}>
              {previewLoading ? "Loading..." : "Preview Documents"}
            </button>
          )}
        </div>
        <button className="advanced-toggle" onClick={() => setShowAdvanced((open) => !open)}>
          {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
        </button>
        {showAdvanced && (
          <div className="advanced-section">
            <div className="form-row">
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Load All Documents</label>
                <Switch
                  checked={loadAll}
                  onChange={setLoadAll}
                  label={loadAll ? "Yes (no limits)" : "No (use filters below)"}
                />
              </div>
              {!loadAll && (
                <>
                  <div className="form-group" style={{ maxWidth: 140 }}>
                    <label>Max Documents</label>
                    <input
                      type="number"
                      value={maxDocs}
                      onChange={(e) => setMaxDocs(+e.target.value)}
                      min={MAX_DOCS_FLOOR}
                      max={MAX_DOCS_CEILING}
                    />
                  </div>
                  {source === "raw" && (
                    <div className="form-group" style={{ maxWidth: 140 }}>
                      <label>Min Text Length</label>
                      <input
                        type="number"
                        value={minTextLen}
                        onChange={(e) => setMinTextLen(+e.target.value)}
                        min={0}
                        max={MIN_TEXT_LEN_CEILING}
                      />
                    </div>
                  )}
                </>
              )}
              {source === "raw" && (
                <div className="form-group" style={{ maxWidth: 220 }}>
                  <label>Source Filter</label>
                  <Select
                    options={[
                      { value: "", label: "All sources" },
                      { value: "TEXT-", label: "TEXT- (native text files)" },
                      { value: "IMAGES-", label: "IMAGES- (OCR from images)" },
                    ]}
                    value={sourceFilter}
                    onChange={setSourceFilter}
                  />
                </div>
              )}
              <div className="form-group" style={{ maxWidth: 200 }}>
                <label>Build Index</label>
                <Switch
                  checked={buildIndex}
                  onChange={setBuildIndex}
                  label={buildIndex ? "Yes (ready to search)" : "No (load only)"}
                />
              </div>
            </div>
          </div>
        )}
        {loading && (
          <StatusMessage
            type="loading"
            message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..."
          />
        )}
        <LogViewer active={loading} />
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {/* Load result */}
      {loadResult && (
        <div className="panel">
          <h2>Dataset Loaded</h2>
          <div className="metric-grid mb-2">
            {loadResult.documents_loaded !== undefined && (
              <MetricCard value={loadResult.documents_loaded} label="Documents" />
            )}
            {loadResult.documents_created !== undefined && (
              <MetricCard value={loadResult.documents_created} label="Documents" />
            )}
            {chunkCount ? <MetricCard value={chunkCount} label="Chunks" /> : null}
            {loadResult.chromadb_vectors !== undefined && (
              <MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
            )}
            <MetricCard value={`${loadResult.seconds}s`} label="Time" />
          </div>
          <StatusMessage
            type="ok"
            message={loadResult.index_built
              ? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
              : "Dataset loaded. Build the index from the Setup tab to enable search."}
          />
        </div>
      )}

      {/* Preview */}
      {previewDocs.length > 0 && (
        <div className="panel">
          <h2>Document Preview ({previewDocs.length} docs)</h2>
          {previewDocs.map((doc, i) => (
            <div key={i} className="result-card" style={{ marginBottom: 8 }}>
              <div className="result-header">
                <span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
                <span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
              </div>
              <div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
                {doc.text_preview}
              </div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
|