Besjon Cifliku commited on
Commit ·
9f87ec0
1
Parent(s): f930251
feat: simplify the workflow and search patterns
Browse files- frontend/src/App.tsx +103 -9
- frontend/src/api.ts +15 -0
- frontend/src/components/DocumentViewer.tsx +84 -0
- frontend/src/components/SemanticSearch.tsx +12 -9
- frontend/src/components/TrainingPanel.tsx +2 -183
- frontend/src/components/Word2VecPanel.tsx +55 -228
- frontend/src/components/Word2VecTools.tsx +193 -0
- frontend/tsconfig.tsbuildinfo +1 -0
- server.py +92 -2
- word2vec_baseline.py +72 -0
frontend/src/App.tsx
CHANGED
|
@@ -10,12 +10,13 @@ import KeywordMatcher from "./components/KeywordMatcher";
|
|
| 10 |
import BatchAnalysis from "./components/BatchAnalysis";
|
| 11 |
import SimilarWords from "./components/SimilarWords";
|
| 12 |
import ContextAnalysis from "./components/ContextAnalysis";
|
| 13 |
-
import EvaluationDashboard from "./components/EvaluationDashboard";
|
| 14 |
import Word2VecPanel from "./components/Word2VecPanel";
|
|
|
|
| 15 |
import DatasetPanel from "./components/DatasetPanel";
|
|
|
|
| 16 |
import "./styles.css";
|
| 17 |
|
| 18 |
-
type NavGroup = "data" | "training" | "analysis"
|
| 19 |
type TrainingTab = "model" | "w2v";
|
| 20 |
type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";
|
| 21 |
|
|
@@ -23,7 +24,6 @@ const STEPS: { id: NavGroup; label: string; needsIndex?: boolean }[] = [
|
|
| 23 |
{ id: "data", label: "Data & Setup" },
|
| 24 |
{ id: "training", label: "Training" },
|
| 25 |
{ id: "analysis", label: "Analysis", needsIndex: true },
|
| 26 |
-
{ id: "evaluation", label: "Evaluation", needsIndex: true },
|
| 27 |
];
|
| 28 |
|
| 29 |
const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
|
|
@@ -48,14 +48,22 @@ export default function App() {
|
|
| 48 |
const [stats, setStats] = useState<CorpusStats | null>(null);
|
| 49 |
const [showManualSetup, setShowManualSetup] = useState(false);
|
| 50 |
const [serverError, setServerError] = useState<string | null>(null);
|
|
|
|
|
|
|
|
|
|
| 51 |
const ready = stats !== null && stats.index_built;
|
| 52 |
|
| 53 |
useEffect(() => {
|
| 54 |
checkConnection().then((err) => {
|
| 55 |
setServerError(err);
|
| 56 |
-
// If server is up, try to fetch stats (engine may have been auto-restored)
|
| 57 |
if (!err) {
|
| 58 |
api.getStats().then(setStats).catch(() => {});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
}
|
| 60 |
});
|
| 61 |
const interval = setInterval(() => {
|
|
@@ -64,11 +72,99 @@ export default function App() {
|
|
| 64 |
return () => clearInterval(interval);
|
| 65 |
}, []);
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
function handleStepClick(id: NavGroup, needsIndex?: boolean) {
|
| 68 |
if (needsIndex && !ready) return;
|
| 69 |
setGroup(id);
|
| 70 |
}
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return (
|
| 73 |
<div className="app">
|
| 74 |
<header className="app-header">
|
|
@@ -91,7 +187,7 @@ export default function App() {
|
|
| 91 |
</div>
|
| 92 |
)}
|
| 93 |
|
| 94 |
-
{/* Progress Stepper
|
| 95 |
<nav className="stepper">
|
| 96 |
{STEPS.map((step, i) => {
|
| 97 |
const disabled = step.needsIndex && !ready;
|
|
@@ -119,7 +215,7 @@ export default function App() {
|
|
| 119 |
})}
|
| 120 |
</nav>
|
| 121 |
|
| 122 |
-
{/* Sub-tabs
|
| 123 |
{group === "training" && (
|
| 124 |
<nav className="subtabs">
|
| 125 |
{TRAINING_TABS.map((t) => (
|
|
@@ -165,7 +261,7 @@ export default function App() {
|
|
| 165 |
)}
|
| 166 |
|
| 167 |
{group === "training" && trainingTab === "model" && <TrainingPanel />}
|
| 168 |
-
{group === "training" && trainingTab === "w2v" && <Word2VecPanel />}
|
| 169 |
|
| 170 |
{group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
|
| 171 |
{group === "analysis" && analysisTab === "words" && <SimilarWords />}
|
|
@@ -174,8 +270,6 @@ export default function App() {
|
|
| 174 |
{group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
|
| 175 |
{group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
|
| 176 |
{group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}
|
| 177 |
-
|
| 178 |
-
{group === "evaluation" && <EvaluationDashboard />}
|
| 179 |
</main>
|
| 180 |
</div>
|
| 181 |
);
|
|
|
|
| 10 |
import BatchAnalysis from "./components/BatchAnalysis";
|
| 11 |
import SimilarWords from "./components/SimilarWords";
|
| 12 |
import ContextAnalysis from "./components/ContextAnalysis";
|
|
|
|
| 13 |
import Word2VecPanel from "./components/Word2VecPanel";
|
| 14 |
+
import Word2VecTools from "./components/Word2VecTools";
|
| 15 |
import DatasetPanel from "./components/DatasetPanel";
|
| 16 |
+
import MetricCard from "./components/MetricCard";
|
| 17 |
import "./styles.css";
|
| 18 |
|
| 19 |
+
type NavGroup = "data" | "training" | "analysis";
|
| 20 |
type TrainingTab = "model" | "w2v";
|
| 21 |
type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";
|
| 22 |
|
|
|
|
| 24 |
{ id: "data", label: "Data & Setup" },
|
| 25 |
{ id: "training", label: "Training" },
|
| 26 |
{ id: "analysis", label: "Analysis", needsIndex: true },
|
|
|
|
| 27 |
];
|
| 28 |
|
| 29 |
const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
|
|
|
|
| 48 |
const [stats, setStats] = useState<CorpusStats | null>(null);
|
| 49 |
const [showManualSetup, setShowManualSetup] = useState(false);
|
| 50 |
const [serverError, setServerError] = useState<string | null>(null);
|
| 51 |
+
const [w2vReady, setW2vReady] = useState(false);
|
| 52 |
+
const [w2vInfo, setW2vInfo] = useState<{ vocab_size: number; sentences: number; vector_size: number } | null>(null);
|
| 53 |
+
const [resetLoading, setResetLoading] = useState(false);
|
| 54 |
const ready = stats !== null && stats.index_built;
|
| 55 |
|
| 56 |
useEffect(() => {
|
| 57 |
checkConnection().then((err) => {
|
| 58 |
setServerError(err);
|
|
|
|
| 59 |
if (!err) {
|
| 60 |
api.getStats().then(setStats).catch(() => {});
|
| 61 |
+
api.w2vStatus().then(res => {
|
| 62 |
+
if (res.ready) {
|
| 63 |
+
setW2vReady(true);
|
| 64 |
+
setW2vInfo({ vocab_size: res.vocab_size!, sentences: res.sentences!, vector_size: res.vector_size! });
|
| 65 |
+
}
|
| 66 |
+
}).catch(() => {});
|
| 67 |
}
|
| 68 |
});
|
| 69 |
const interval = setInterval(() => {
|
|
|
|
| 72 |
return () => clearInterval(interval);
|
| 73 |
}, []);
|
| 74 |
|
| 75 |
+
function handleW2vReady(ready: boolean, info?: { vocab_size: number; sentences: number; vector_size: number }) {
|
| 76 |
+
setW2vReady(ready);
|
| 77 |
+
setW2vInfo(ready && info ? info : null);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
async function handleReset() {
|
| 81 |
+
setResetLoading(true);
|
| 82 |
+
try {
|
| 83 |
+
await api.w2vReset();
|
| 84 |
+
setW2vReady(false);
|
| 85 |
+
setW2vInfo(null);
|
| 86 |
+
} catch {
|
| 87 |
+
// ignore
|
| 88 |
+
} finally {
|
| 89 |
+
setResetLoading(false);
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
function handleStepClick(id: NavGroup, needsIndex?: boolean) {
|
| 94 |
if (needsIndex && !ready) return;
|
| 95 |
setGroup(id);
|
| 96 |
}
|
| 97 |
|
| 98 |
+
// ── W2V trained: stats bar + analysis tabs, no stepper ──
|
| 99 |
+
if (w2vReady && w2vInfo) {
|
| 100 |
+
return (
|
| 101 |
+
<div className="app">
|
| 102 |
+
<header className="app-header">
|
| 103 |
+
<h1>Contextual Similarity Engine</h1>
|
| 104 |
+
{stats && (
|
| 105 |
+
<div className="header-stats">
|
| 106 |
+
<span className="badge">{stats.model_name}</span>
|
| 107 |
+
<span className="badge">{stats.total_documents} docs</span>
|
| 108 |
+
<span className="badge">{stats.total_chunks} chunks</span>
|
| 109 |
+
</div>
|
| 110 |
+
)}
|
| 111 |
+
</header>
|
| 112 |
+
|
| 113 |
+
{serverError && (
|
| 114 |
+
<div className="server-error-banner">
|
| 115 |
+
<strong>Server unavailable:</strong> {serverError}
|
| 116 |
+
</div>
|
| 117 |
+
)}
|
| 118 |
+
|
| 119 |
+
{/* W2V stats bar */}
|
| 120 |
+
<div className="content">
|
| 121 |
+
<div className="panel">
|
| 122 |
+
<div style={{ display: "flex", alignItems: "center", justifyContent: "space-between", flexWrap: "wrap", gap: 12 }}>
|
| 123 |
+
<h2 style={{ margin: 0 }}>Word2Vec Baseline</h2>
|
| 124 |
+
<button className="btn btn-secondary" onClick={handleReset} disabled={resetLoading}
|
| 125 |
+
style={{ fontSize: "0.85em" }}>
|
| 126 |
+
{resetLoading ? "Resetting..." : "Reset & Retrain"}
|
| 127 |
+
</button>
|
| 128 |
+
</div>
|
| 129 |
+
<div className="metric-grid" style={{ marginTop: 12 }}>
|
| 130 |
+
<MetricCard value={w2vInfo.vocab_size} label="Vocabulary" />
|
| 131 |
+
<MetricCard value={w2vInfo.sentences} label="Sentences" />
|
| 132 |
+
<MetricCard value={w2vInfo.vector_size} label="Dimensions" />
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
|
| 136 |
+
{/* W2V-specific tools: Similar Words, Compare, Semantic Search */}
|
| 137 |
+
<Word2VecTools />
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
{/* Transformer Analysis sub-tabs */}
|
| 141 |
+
<nav className="subtabs">
|
| 142 |
+
{ANALYSIS_TABS.map((t) => (
|
| 143 |
+
<button
|
| 144 |
+
key={t.id}
|
| 145 |
+
className={`subtab ${analysisTab === t.id ? "subtab-active" : ""}`}
|
| 146 |
+
onClick={() => setAnalysisTab(t.id)}
|
| 147 |
+
>
|
| 148 |
+
{t.label}
|
| 149 |
+
</button>
|
| 150 |
+
))}
|
| 151 |
+
</nav>
|
| 152 |
+
|
| 153 |
+
{/* Analysis content */}
|
| 154 |
+
<main className="content">
|
| 155 |
+
{analysisTab === "context" && <ContextAnalysis />}
|
| 156 |
+
{analysisTab === "words" && <SimilarWords />}
|
| 157 |
+
{analysisTab === "search" && <SemanticSearch />}
|
| 158 |
+
{analysisTab === "compare" && <TextCompare />}
|
| 159 |
+
{analysisTab === "keyword" && <KeywordAnalysis />}
|
| 160 |
+
{analysisTab === "match" && <KeywordMatcher />}
|
| 161 |
+
{analysisTab === "batch" && <BatchAnalysis />}
|
| 162 |
+
</main>
|
| 163 |
+
</div>
|
| 164 |
+
);
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
// ── Normal stepper flow ──
|
| 168 |
return (
|
| 169 |
<div className="app">
|
| 170 |
<header className="app-header">
|
|
|
|
| 187 |
</div>
|
| 188 |
)}
|
| 189 |
|
| 190 |
+
{/* Progress Stepper */}
|
| 191 |
<nav className="stepper">
|
| 192 |
{STEPS.map((step, i) => {
|
| 193 |
const disabled = step.needsIndex && !ready;
|
|
|
|
| 215 |
})}
|
| 216 |
</nav>
|
| 217 |
|
| 218 |
+
{/* Sub-tabs */}
|
| 219 |
{group === "training" && (
|
| 220 |
<nav className="subtabs">
|
| 221 |
{TRAINING_TABS.map((t) => (
|
|
|
|
| 261 |
)}
|
| 262 |
|
| 263 |
{group === "training" && trainingTab === "model" && <TrainingPanel />}
|
| 264 |
+
{group === "training" && trainingTab === "w2v" && <Word2VecPanel onReady={handleW2vReady} />}
|
| 265 |
|
| 266 |
{group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
|
| 267 |
{group === "analysis" && analysisTab === "words" && <SimilarWords />}
|
|
|
|
| 270 |
{group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
|
| 271 |
{group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
|
| 272 |
{group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}
|
|
|
|
|
|
|
| 273 |
</main>
|
| 274 |
</div>
|
| 275 |
);
|
frontend/src/api.ts
CHANGED
|
@@ -110,6 +110,9 @@ export const api = {
|
|
| 110 |
getCorpusTexts: (maxDocs: number = 500) =>
|
| 111 |
client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
|
| 112 |
|
|
|
|
|
|
|
|
|
|
| 113 |
// ---- Engine persistence ----
|
| 114 |
saveEngine: () =>
|
| 115 |
client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
|
|
@@ -131,6 +134,18 @@ export const api = {
|
|
| 131 |
w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
|
| 132 |
client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
w2vCompare: (data: { text_a: string; text_b: string }) =>
|
| 135 |
client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
|
| 136 |
|
|
|
|
| 110 |
getCorpusTexts: (maxDocs: number = 500) =>
|
| 111 |
client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
|
| 112 |
|
| 113 |
+
getDocument: (docId: string) =>
|
| 114 |
+
client.get<{ doc_id: string; text: string; num_chunks: number }>(`/documents/${encodeURIComponent(docId)}`).then(r => r.data),
|
| 115 |
+
|
| 116 |
// ---- Engine persistence ----
|
| 117 |
saveEngine: () =>
|
| 118 |
client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
|
|
|
|
| 134 |
w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
|
| 135 |
client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
|
| 136 |
|
| 137 |
+
w2vInitFromEngine: (params?: { vector_size?: number; window?: number; epochs?: number }) =>
|
| 138 |
+
client.post<W2VInitResponse & { documents_used: number }>(`/w2v/init-from-engine`, null, {
|
| 139 |
+
...long,
|
| 140 |
+
params: { ...(_sign && { __sign: _sign }), ...params },
|
| 141 |
+
}).then(r => r.data),
|
| 142 |
+
|
| 143 |
+
w2vStatus: () =>
|
| 144 |
+
client.get<{ ready: boolean; vocab_size?: number; sentences?: number; vector_size?: number; has_saved_state?: boolean }>("/w2v/status").then(r => r.data),
|
| 145 |
+
|
| 146 |
+
w2vReset: () =>
|
| 147 |
+
client.post<{ status: string; message: string }>("/w2v/reset").then(r => r.data),
|
| 148 |
+
|
| 149 |
w2vCompare: (data: { text_a: string; text_b: string }) =>
|
| 150 |
client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
|
| 151 |
|
frontend/src/components/DocumentViewer.tsx
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, type ReactNode } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
|
| 4 |
+
interface Props {
|
| 5 |
+
docId: string;
|
| 6 |
+
children: ReactNode;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
export default function DocumentViewer({ docId, children }: Props) {
|
| 10 |
+
const [expanded, setExpanded] = useState(false);
|
| 11 |
+
const [fullText, setFullText] = useState<string | null>(null);
|
| 12 |
+
const [loading, setLoading] = useState(false);
|
| 13 |
+
const [error, setError] = useState("");
|
| 14 |
+
|
| 15 |
+
async function handleClick() {
|
| 16 |
+
if (expanded) {
|
| 17 |
+
setExpanded(false);
|
| 18 |
+
return;
|
| 19 |
+
}
|
| 20 |
+
if (fullText !== null) {
|
| 21 |
+
setExpanded(true);
|
| 22 |
+
return;
|
| 23 |
+
}
|
| 24 |
+
setLoading(true); setError("");
|
| 25 |
+
try {
|
| 26 |
+
const res = await api.getDocument(docId);
|
| 27 |
+
setFullText(res.text);
|
| 28 |
+
setExpanded(true);
|
| 29 |
+
} catch (err) {
|
| 30 |
+
setError(getErrorMessage(err));
|
| 31 |
+
} finally {
|
| 32 |
+
setLoading(false);
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
return (
|
| 37 |
+
<div>
|
| 38 |
+
<div onClick={handleClick}>
|
| 39 |
+
{children}
|
| 40 |
+
</div>
|
| 41 |
+
{loading && (
|
| 42 |
+
<div style={{ padding: "8px 12px", color: "var(--text-dim)", fontSize: "0.85rem" }}>
|
| 43 |
+
Loading document...
|
| 44 |
+
</div>
|
| 45 |
+
)}
|
| 46 |
+
{error && (
|
| 47 |
+
<div style={{ padding: "8px 12px", color: "var(--danger)", fontSize: "0.85rem" }}>
|
| 48 |
+
{error}
|
| 49 |
+
</div>
|
| 50 |
+
)}
|
| 51 |
+
{expanded && fullText !== null && (
|
| 52 |
+
<div style={{
|
| 53 |
+
background: "var(--bg-elevated, #0d1117)",
|
| 54 |
+
border: "1px solid var(--border)",
|
| 55 |
+
borderTop: "none",
|
| 56 |
+
borderRadius: "0 0 var(--radius) var(--radius)",
|
| 57 |
+
padding: "12px 16px",
|
| 58 |
+
marginTop: -4,
|
| 59 |
+
marginBottom: 8,
|
| 60 |
+
maxHeight: 400,
|
| 61 |
+
overflowY: "auto",
|
| 62 |
+
fontSize: "0.82rem",
|
| 63 |
+
lineHeight: 1.7,
|
| 64 |
+
whiteSpace: "pre-wrap",
|
| 65 |
+
wordBreak: "break-word",
|
| 66 |
+
color: "var(--text)",
|
| 67 |
+
}}>
|
| 68 |
+
<div style={{ display: "flex", justifyContent: "space-between", marginBottom: 8 }}>
|
| 69 |
+
<span style={{ fontWeight: 600 }}>{docId}</span>
|
| 70 |
+
<button
|
| 71 |
+
onClick={(e) => { e.stopPropagation(); setExpanded(false); }}
|
| 72 |
+
style={{
|
| 73 |
+
background: "none", border: "none", color: "var(--text-dim)",
|
| 74 |
+
cursor: "pointer", fontSize: "0.8rem",
|
| 75 |
+
}}>
|
| 76 |
+
Close
|
| 77 |
+
</button>
|
| 78 |
+
</div>
|
| 79 |
+
{fullText}
|
| 80 |
+
</div>
|
| 81 |
+
)}
|
| 82 |
+
</div>
|
| 83 |
+
);
|
| 84 |
+
}
|
frontend/src/components/SemanticSearch.tsx
CHANGED
|
@@ -4,6 +4,7 @@ import type { QueryResultItem } from "../types";
|
|
| 4 |
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
import ScoreBar from "./ScoreBar";
|
| 6 |
import StatusMessage from "./StatusMessage";
|
|
|
|
| 7 |
|
| 8 |
export default function SemanticSearch() {
|
| 9 |
const [query, setQuery] = useState("");
|
|
@@ -51,17 +52,19 @@ export default function SemanticSearch() {
|
|
| 51 |
<div className="panel">
|
| 52 |
<h3>Results ({results.length})</h3>
|
| 53 |
{results.map((r) => (
|
| 54 |
-
<
|
| 55 |
-
<div className="result-
|
| 56 |
-
<div>
|
| 57 |
-
<
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
</div>
|
| 61 |
-
<
|
| 62 |
</div>
|
| 63 |
-
|
| 64 |
-
</div>
|
| 65 |
))}
|
| 66 |
</div>
|
| 67 |
)}
|
|
|
|
| 4 |
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
import ScoreBar from "./ScoreBar";
|
| 6 |
import StatusMessage from "./StatusMessage";
|
| 7 |
+
import DocumentViewer from "./DocumentViewer";
|
| 8 |
|
| 9 |
export default function SemanticSearch() {
|
| 10 |
const [query, setQuery] = useState("");
|
|
|
|
| 52 |
<div className="panel">
|
| 53 |
<h3>Results ({results.length})</h3>
|
| 54 |
{results.map((r) => (
|
| 55 |
+
<DocumentViewer key={`${r.doc_id}-${r.chunk_index}`} docId={r.doc_id}>
|
| 56 |
+
<div className="result-card" style={{ cursor: "pointer" }}>
|
| 57 |
+
<div className="result-header">
|
| 58 |
+
<div>
|
| 59 |
+
<span className="badge">#{r.rank}</span>{" "}
|
| 60 |
+
<span className="badge">{r.doc_id}</span>{" "}
|
| 61 |
+
<span className="tag">chunk {r.chunk_index}</span>
|
| 62 |
+
</div>
|
| 63 |
+
<ScoreBar score={r.score} />
|
| 64 |
</div>
|
| 65 |
+
<div className="result-text">{r.text}</div>
|
| 66 |
</div>
|
| 67 |
+
</DocumentViewer>
|
|
|
|
| 68 |
))}
|
| 69 |
</div>
|
| 70 |
)}
|
frontend/src/components/TrainingPanel.tsx
CHANGED
|
@@ -1,9 +1,7 @@
|
|
| 1 |
import { useState } from "react";
|
| 2 |
import { api, getErrorMessage } from "../api";
|
| 3 |
-
import type { TrainResponse
|
| 4 |
import { useCorpusLoader } from "../hooks/useCorpusLoader";
|
| 5 |
-
import { scoreColor } from "../utils/colors";
|
| 6 |
-
import ScoreBar from "./ScoreBar";
|
| 7 |
import StatusMessage from "./StatusMessage";
|
| 8 |
import MetricCard from "./MetricCard";
|
| 9 |
import Toggle from "./Toggle";
|
|
@@ -12,11 +10,6 @@ import LogViewer from "./LogViewer";
|
|
| 12 |
|
| 13 |
type Strategy = "unsupervised" | "contrastive" | "keywords";
|
| 14 |
|
| 15 |
-
interface SimilarWord {
|
| 16 |
-
word: string;
|
| 17 |
-
score: number;
|
| 18 |
-
}
|
| 19 |
-
|
| 20 |
const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
|
| 21 |
{ id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
|
| 22 |
{ id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
|
|
@@ -42,26 +35,6 @@ export default function TrainingPanel() {
|
|
| 42 |
|
| 43 |
const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
|
| 44 |
|
| 45 |
-
// Similar words
|
| 46 |
-
const [simWord, setSimWord] = useState("");
|
| 47 |
-
const [simTopK, setSimTopK] = useState(10);
|
| 48 |
-
const [simResults, setSimResults] = useState<SimilarWord[]>([]);
|
| 49 |
-
const [simLoading, setSimLoading] = useState(false);
|
| 50 |
-
|
| 51 |
-
// Compare
|
| 52 |
-
const [compTextA, setCompTextA] = useState("");
|
| 53 |
-
const [compTextB, setCompTextB] = useState("");
|
| 54 |
-
const [compResult, setCompResult] = useState<CompareResponse | null>(null);
|
| 55 |
-
const [compLoading, setCompLoading] = useState(false);
|
| 56 |
-
|
| 57 |
-
// Search
|
| 58 |
-
const [queryText, setQueryText] = useState("");
|
| 59 |
-
const [queryTopK, setQueryTopK] = useState(5);
|
| 60 |
-
const [queryResults, setQueryResults] = useState<QueryResultItem[]>([]);
|
| 61 |
-
const [queryLoading, setQueryLoading] = useState(false);
|
| 62 |
-
|
| 63 |
-
const ready = result !== null;
|
| 64 |
-
|
| 65 |
async function handleTrain() {
|
| 66 |
setTraining(true); setError(""); setResult(null);
|
| 67 |
try {
|
|
@@ -87,42 +60,6 @@ export default function TrainingPanel() {
|
|
| 87 |
}
|
| 88 |
}
|
| 89 |
|
| 90 |
-
async function handleSimilarWords() {
|
| 91 |
-
setSimLoading(true); setError("");
|
| 92 |
-
try {
|
| 93 |
-
const res = await api.similarWords({ word: simWord, top_k: simTopK });
|
| 94 |
-
setSimResults(res.similar);
|
| 95 |
-
} catch (err) {
|
| 96 |
-
setError(getErrorMessage(err));
|
| 97 |
-
} finally {
|
| 98 |
-
setSimLoading(false);
|
| 99 |
-
}
|
| 100 |
-
}
|
| 101 |
-
|
| 102 |
-
async function handleCompare() {
|
| 103 |
-
setCompLoading(true); setError("");
|
| 104 |
-
try {
|
| 105 |
-
const res = await api.compare({ text_a: compTextA, text_b: compTextB });
|
| 106 |
-
setCompResult(res);
|
| 107 |
-
} catch (err) {
|
| 108 |
-
setError(getErrorMessage(err));
|
| 109 |
-
} finally {
|
| 110 |
-
setCompLoading(false);
|
| 111 |
-
}
|
| 112 |
-
}
|
| 113 |
-
|
| 114 |
-
async function handleQuery() {
|
| 115 |
-
setQueryLoading(true); setError("");
|
| 116 |
-
try {
|
| 117 |
-
const res = await api.query({ text: queryText, top_k: queryTopK });
|
| 118 |
-
setQueryResults(res.results);
|
| 119 |
-
} catch (err) {
|
| 120 |
-
setError(getErrorMessage(err));
|
| 121 |
-
} finally {
|
| 122 |
-
setQueryLoading(false);
|
| 123 |
-
}
|
| 124 |
-
}
|
| 125 |
-
|
| 126 |
return (
|
| 127 |
<div>
|
| 128 |
{/* 1. Training (strategy + config + corpus merged) */}
|
|
@@ -223,127 +160,9 @@ export default function TrainingPanel() {
|
|
| 223 |
<MetricCard value={`${result.seconds}s`} label="Time" />
|
| 224 |
</div>
|
| 225 |
<StatusMessage type="ok"
|
| 226 |
-
message={`Model saved: ${result.model_path} — use this path in the Setup tab.`} />
|
| 227 |
</div>
|
| 228 |
)}
|
| 229 |
-
|
| 230 |
-
{/* 2. Similar Words */}
|
| 231 |
-
<div className="panel">
|
| 232 |
-
<h2>2. Similar Words</h2>
|
| 233 |
-
<p className="panel-desc">
|
| 234 |
-
Find words that appear in similar contexts using transformer embeddings.
|
| 235 |
-
</p>
|
| 236 |
-
<div className="form-row">
|
| 237 |
-
<div className="form-group">
|
| 238 |
-
<label>Word</label>
|
| 239 |
-
<input value={simWord} onChange={e => setSimWord(e.target.value)}
|
| 240 |
-
onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
|
| 241 |
-
placeholder="e.g. pizza" />
|
| 242 |
-
</div>
|
| 243 |
-
<div className="form-group form-group-sm">
|
| 244 |
-
<label>Top K</label>
|
| 245 |
-
<input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
|
| 246 |
-
</div>
|
| 247 |
-
<div className="form-group form-group-sm">
|
| 248 |
-
<label> </label>
|
| 249 |
-
<button className="btn btn-primary" onClick={handleSimilarWords}
|
| 250 |
-
disabled={simLoading || !simWord.trim()}>
|
| 251 |
-
{simLoading ? "Searching..." : "Find"}
|
| 252 |
-
</button>
|
| 253 |
-
</div>
|
| 254 |
-
</div>
|
| 255 |
-
|
| 256 |
-
{simResults.length > 0 && (
|
| 257 |
-
<table className="data-table" style={{ marginTop: 12 }}>
|
| 258 |
-
<thead>
|
| 259 |
-
<tr><th>Word</th><th>Similarity</th></tr>
|
| 260 |
-
</thead>
|
| 261 |
-
<tbody>
|
| 262 |
-
{simResults.map((r, i) => (
|
| 263 |
-
<tr key={i}>
|
| 264 |
-
<td style={{ fontWeight: 600 }}>{r.word}</td>
|
| 265 |
-
<td><ScoreBar score={r.score} /></td>
|
| 266 |
-
</tr>
|
| 267 |
-
))}
|
| 268 |
-
</tbody>
|
| 269 |
-
</table>
|
| 270 |
-
)}
|
| 271 |
-
</div>
|
| 272 |
-
|
| 273 |
-
{/* 3. Compare Texts */}
|
| 274 |
-
<div className="panel">
|
| 275 |
-
<h2>3. Compare Texts</h2>
|
| 276 |
-
<p className="panel-desc">
|
| 277 |
-
Sentence similarity via transformer contextual embeddings.
|
| 278 |
-
</p>
|
| 279 |
-
<div className="form-row">
|
| 280 |
-
<div className="form-group">
|
| 281 |
-
<label>Text A</label>
|
| 282 |
-
<input value={compTextA} onChange={e => setCompTextA(e.target.value)}
|
| 283 |
-
placeholder="pizza gives me homework" />
|
| 284 |
-
</div>
|
| 285 |
-
<div className="form-group">
|
| 286 |
-
<label>Text B</label>
|
| 287 |
-
<input value={compTextB} onChange={e => setCompTextB(e.target.value)}
|
| 288 |
-
placeholder="school gives me homework" />
|
| 289 |
-
</div>
|
| 290 |
-
</div>
|
| 291 |
-
<button className="btn btn-primary" onClick={handleCompare}
|
| 292 |
-
disabled={compLoading || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
|
| 293 |
-
{compLoading ? "Comparing..." : "Compare"}
|
| 294 |
-
</button>
|
| 295 |
-
|
| 296 |
-
{compResult && (
|
| 297 |
-
<div className="similarity-gauge" style={{ marginTop: 16 }}>
|
| 298 |
-
<div className="similarity-value"
|
| 299 |
-
style={{ color: scoreColor(compResult.similarity) }}>
|
| 300 |
-
{compResult.similarity.toFixed(4)}
|
| 301 |
-
</div>
|
| 302 |
-
<div className="similarity-label">Transformer Cosine Similarity</div>
|
| 303 |
-
</div>
|
| 304 |
-
)}
|
| 305 |
-
</div>
|
| 306 |
-
|
| 307 |
-
{/* 4. Semantic Search */}
|
| 308 |
-
<div className="panel">
|
| 309 |
-
<h2>4. Semantic Search</h2>
|
| 310 |
-
<p className="panel-desc">
|
| 311 |
-
Search your corpus using transformer embeddings.
|
| 312 |
-
</p>
|
| 313 |
-
<div className="form-row">
|
| 314 |
-
<div className="form-group">
|
| 315 |
-
<label>Query</label>
|
| 316 |
-
<input value={queryText} onChange={e => setQueryText(e.target.value)}
|
| 317 |
-
onKeyDown={e => e.key === "Enter" && handleQuery()}
|
| 318 |
-
placeholder="a place where children learn" />
|
| 319 |
-
</div>
|
| 320 |
-
<div className="form-group form-group-sm">
|
| 321 |
-
<label>Top K</label>
|
| 322 |
-
<input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
|
| 323 |
-
</div>
|
| 324 |
-
<div className="form-group form-group-sm">
|
| 325 |
-
<label> </label>
|
| 326 |
-
<button className="btn btn-primary" onClick={handleQuery}
|
| 327 |
-
disabled={queryLoading || !queryText.trim()}>
|
| 328 |
-
{queryLoading ? "Searching..." : "Search"}
|
| 329 |
-
</button>
|
| 330 |
-
</div>
|
| 331 |
-
</div>
|
| 332 |
-
|
| 333 |
-
{queryResults.length > 0 && (
|
| 334 |
-
<div style={{ marginTop: 12 }}>
|
| 335 |
-
{queryResults.map((r, i) => (
|
| 336 |
-
<div key={i} className="result-card">
|
| 337 |
-
<div className="result-header">
|
| 338 |
-
<span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
|
| 339 |
-
<ScoreBar score={r.score} />
|
| 340 |
-
</div>
|
| 341 |
-
<div className="result-text">{r.text}</div>
|
| 342 |
-
</div>
|
| 343 |
-
))}
|
| 344 |
-
</div>
|
| 345 |
-
)}
|
| 346 |
-
</div>
|
| 347 |
</div>
|
| 348 |
);
|
| 349 |
}
|
|
|
|
| 1 |
import { useState } from "react";
|
| 2 |
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { TrainResponse } from "../types";
|
| 4 |
import { useCorpusLoader } from "../hooks/useCorpusLoader";
|
|
|
|
|
|
|
| 5 |
import StatusMessage from "./StatusMessage";
|
| 6 |
import MetricCard from "./MetricCard";
|
| 7 |
import Toggle from "./Toggle";
|
|
|
|
| 10 |
|
| 11 |
type Strategy = "unsupervised" | "contrastive" | "keywords";
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
|
| 14 |
{ id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
|
| 15 |
{ id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
|
|
|
|
| 35 |
|
| 36 |
const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
async function handleTrain() {
|
| 39 |
setTraining(true); setError(""); setResult(null);
|
| 40 |
try {
|
|
|
|
| 60 |
}
|
| 61 |
}
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return (
|
| 64 |
<div>
|
| 65 |
{/* 1. Training (strategy + config + corpus merged) */}
|
|
|
|
| 160 |
<MetricCard value={`${result.seconds}s`} label="Time" />
|
| 161 |
</div>
|
| 162 |
<StatusMessage type="ok"
|
| 163 |
+
message={`Model saved: ${result.model_path} — use this path in the Setup tab, then go to Analysis to explore results.`} />
|
| 164 |
</div>
|
| 165 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
</div>
|
| 167 |
);
|
| 168 |
}
|
frontend/src/components/Word2VecPanel.tsx
CHANGED
|
@@ -1,54 +1,43 @@
|
|
| 1 |
-
import { useState } from "react";
|
| 2 |
import { api, getErrorMessage } from "../api";
|
| 3 |
-
import type { W2VInitResponse
|
| 4 |
-
import { useCorpusLoader } from "../hooks/useCorpusLoader";
|
| 5 |
-
import { scoreColor } from "../utils/colors";
|
| 6 |
-
import ScoreBar from "./ScoreBar";
|
| 7 |
import StatusMessage from "./StatusMessage";
|
| 8 |
import LogViewer from "./LogViewer";
|
| 9 |
import MetricCard from "./MetricCard";
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
const [vectorSize, setVectorSize] = useState(100);
|
| 14 |
const [windowSize, setWindowSize] = useState(5);
|
| 15 |
const [w2vEpochs, setW2vEpochs] = useState(50);
|
| 16 |
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 17 |
const [initLoading, setInitLoading] = useState(false);
|
| 18 |
-
const [
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
const [compResult, setCompResult] = useState<CompareResponse | null>(null);
|
| 32 |
-
const [compLoading, setCompLoading] = useState(false);
|
| 33 |
-
|
| 34 |
-
// Query
|
| 35 |
-
const [queryText, setQueryText] = useState("");
|
| 36 |
-
const [queryTopK, setQueryTopK] = useState(5);
|
| 37 |
-
const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
|
| 38 |
-
const [queryLoading, setQueryLoading] = useState(false);
|
| 39 |
-
|
| 40 |
-
async function handleInit() {
|
| 41 |
-
setInitLoading(true); setError(""); setInitResult(null);
|
| 42 |
try {
|
| 43 |
-
const
|
| 44 |
-
if (!corpus.length) { setError("Corpus is empty."); setInitLoading(false); return; }
|
| 45 |
-
const res = await api.w2vInit({
|
| 46 |
-
corpus_texts: corpus,
|
| 47 |
vector_size: vectorSize,
|
| 48 |
window: windowSize,
|
| 49 |
epochs: w2vEpochs,
|
| 50 |
});
|
| 51 |
-
|
| 52 |
} catch (err) {
|
| 53 |
setError(getErrorMessage(err));
|
| 54 |
} finally {
|
|
@@ -56,76 +45,44 @@ export default function Word2VecPanel() {
|
|
| 56 |
}
|
| 57 |
}
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
try {
|
| 62 |
-
const res = await api.w2vSimilarWords({ word: simWord, top_k: simTopK });
|
| 63 |
-
setSimResults(res.similar);
|
| 64 |
-
} catch (err) {
|
| 65 |
-
setError(getErrorMessage(err));
|
| 66 |
-
} finally {
|
| 67 |
-
setSimLoading(false);
|
| 68 |
-
}
|
| 69 |
}
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
const res = await api.w2vQuery({ text: queryText, top_k: queryTopK });
|
| 87 |
-
setQueryResults(res.results);
|
| 88 |
-
} catch (err) {
|
| 89 |
-
setError(getErrorMessage(err));
|
| 90 |
-
} finally {
|
| 91 |
-
setQueryLoading(false);
|
| 92 |
-
}
|
| 93 |
}
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
return (
|
| 98 |
<div>
|
| 99 |
-
{/* 1. Training */}
|
| 100 |
<div className="panel">
|
| 101 |
-
<h2>
|
| 102 |
<p className="panel-desc">
|
| 103 |
Static embeddings — one vector per word, no context awareness.
|
| 104 |
-
|
| 105 |
</p>
|
| 106 |
-
|
| 107 |
-
<button className="btn btn-secondary" onClick={loadFromEngine}
|
| 108 |
-
disabled={corpusLoading}>
|
| 109 |
-
{corpusLoading ? "Loading..." : "Load from Engine"}
|
| 110 |
-
</button>
|
| 111 |
-
{corpusText && (
|
| 112 |
-
<button className="btn btn-secondary" onClick={() => setCorpusText("")}>
|
| 113 |
-
Clear
|
| 114 |
-
</button>
|
| 115 |
-
)}
|
| 116 |
-
</div>
|
| 117 |
-
<div className="form-group" style={{ marginBottom: 12 }}>
|
| 118 |
-
<label>
|
| 119 |
-
Corpus (separate documents with blank lines)
|
| 120 |
-
{corpusText && (
|
| 121 |
-
<span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
|
| 122 |
-
{" "} — {parseCorpus().length} documents detected
|
| 123 |
-
</span>
|
| 124 |
-
)}
|
| 125 |
-
</label>
|
| 126 |
-
<textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
|
| 127 |
-
placeholder="Document 1 text...\n\nDocument 2 text..." />
|
| 128 |
-
</div>
|
| 129 |
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 130 |
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 131 |
</button>
|
|
@@ -149,145 +106,15 @@ export default function Word2VecPanel() {
|
|
| 149 |
</div>
|
| 150 |
)}
|
| 151 |
|
| 152 |
-
<button className="btn btn-primary" onClick={
|
| 153 |
-
disabled={initLoading
|
| 154 |
-
{initLoading ? <><span className="spinner" /> Training...</> : "Train Word2Vec"}
|
| 155 |
</button>
|
| 156 |
|
| 157 |
<LogViewer active={initLoading} />
|
| 158 |
</div>
|
| 159 |
|
| 160 |
{error && <StatusMessage type="err" message={error} />}
|
| 161 |
-
|
| 162 |
-
{initResult && (
|
| 163 |
-
<div className="panel">
|
| 164 |
-
<h2>Word2Vec Ready</h2>
|
| 165 |
-
<div className="metric-grid">
|
| 166 |
-
<MetricCard value={initResult.vocab_size} label="Vocabulary" />
|
| 167 |
-
<MetricCard value={initResult.sentences} label="Sentences" />
|
| 168 |
-
<MetricCard value={initResult.vector_size} label="Dimensions" />
|
| 169 |
-
<MetricCard value={`${initResult.seconds}s`} label="Time" />
|
| 170 |
-
</div>
|
| 171 |
-
</div>
|
| 172 |
-
)}
|
| 173 |
-
|
| 174 |
-
{/* 2. Similar Words */}
|
| 175 |
-
<div className="panel">
|
| 176 |
-
<h2>2. Similar Words</h2>
|
| 177 |
-
<p className="panel-desc">
|
| 178 |
-
Find words that appear in similar contexts using Word2Vec static embeddings.
|
| 179 |
-
</p>
|
| 180 |
-
<div className="form-row">
|
| 181 |
-
<div className="form-group">
|
| 182 |
-
<label>Word</label>
|
| 183 |
-
<input value={simWord} onChange={e => setSimWord(e.target.value)}
|
| 184 |
-
onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
|
| 185 |
-
placeholder="e.g. pizza" />
|
| 186 |
-
</div>
|
| 187 |
-
<div className="form-group form-group-sm">
|
| 188 |
-
<label>Top K</label>
|
| 189 |
-
<input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
|
| 190 |
-
</div>
|
| 191 |
-
<div className="form-group form-group-sm">
|
| 192 |
-
<label> </label>
|
| 193 |
-
<button className="btn btn-primary" onClick={handleSimilarWords}
|
| 194 |
-
disabled={simLoading || !ready || !simWord.trim()}>
|
| 195 |
-
{simLoading ? "Searching..." : "Find"}
|
| 196 |
-
</button>
|
| 197 |
-
</div>
|
| 198 |
-
</div>
|
| 199 |
-
|
| 200 |
-
{simResults.length > 0 && (
|
| 201 |
-
<table className="data-table" style={{ marginTop: 12 }}>
|
| 202 |
-
<thead>
|
| 203 |
-
<tr><th>Word</th><th>Similarity</th></tr>
|
| 204 |
-
</thead>
|
| 205 |
-
<tbody>
|
| 206 |
-
{simResults.map((r, i) => (
|
| 207 |
-
<tr key={i}>
|
| 208 |
-
<td style={{ fontWeight: 600 }}>{r.word}</td>
|
| 209 |
-
<td><ScoreBar score={r.score} /></td>
|
| 210 |
-
</tr>
|
| 211 |
-
))}
|
| 212 |
-
</tbody>
|
| 213 |
-
</table>
|
| 214 |
-
)}
|
| 215 |
-
</div>
|
| 216 |
-
|
| 217 |
-
{/* 3. Compare Texts */}
|
| 218 |
-
<div className="panel">
|
| 219 |
-
<h2>3. Compare Texts</h2>
|
| 220 |
-
<p className="panel-desc">
|
| 221 |
-
Sentence similarity via averaged word vectors.
|
| 222 |
-
</p>
|
| 223 |
-
<div className="form-row">
|
| 224 |
-
<div className="form-group">
|
| 225 |
-
<label>Text A</label>
|
| 226 |
-
<input value={compTextA} onChange={e => setCompTextA(e.target.value)}
|
| 227 |
-
placeholder="pizza gives me homework" />
|
| 228 |
-
</div>
|
| 229 |
-
<div className="form-group">
|
| 230 |
-
<label>Text B</label>
|
| 231 |
-
<input value={compTextB} onChange={e => setCompTextB(e.target.value)}
|
| 232 |
-
placeholder="school gives me homework" />
|
| 233 |
-
</div>
|
| 234 |
-
</div>
|
| 235 |
-
<button className="btn btn-primary" onClick={handleCompare}
|
| 236 |
-
disabled={compLoading || !ready || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
|
| 237 |
-
{compLoading ? "Comparing..." : "Compare"}
|
| 238 |
-
</button>
|
| 239 |
-
|
| 240 |
-
{compResult && (
|
| 241 |
-
<div className="similarity-gauge" style={{ marginTop: 16 }}>
|
| 242 |
-
<div className="similarity-value"
|
| 243 |
-
style={{ color: scoreColor(compResult.similarity) }}>
|
| 244 |
-
{compResult.similarity.toFixed(4)}
|
| 245 |
-
</div>
|
| 246 |
-
<div className="similarity-label">Word2Vec Cosine Similarity</div>
|
| 247 |
-
</div>
|
| 248 |
-
)}
|
| 249 |
-
</div>
|
| 250 |
-
|
| 251 |
-
{/* 4. Semantic Search */}
|
| 252 |
-
<div className="panel">
|
| 253 |
-
<h2>4. Semantic Search</h2>
|
| 254 |
-
<p className="panel-desc">
|
| 255 |
-
Search your corpus using averaged Word2Vec vectors.
|
| 256 |
-
</p>
|
| 257 |
-
<div className="form-row">
|
| 258 |
-
<div className="form-group">
|
| 259 |
-
<label>Query</label>
|
| 260 |
-
<input value={queryText} onChange={e => setQueryText(e.target.value)}
|
| 261 |
-
onKeyDown={e => e.key === "Enter" && handleQuery()}
|
| 262 |
-
placeholder="a place where children learn" />
|
| 263 |
-
</div>
|
| 264 |
-
<div className="form-group form-group-sm">
|
| 265 |
-
<label>Top K</label>
|
| 266 |
-
<input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
|
| 267 |
-
</div>
|
| 268 |
-
<div className="form-group form-group-sm">
|
| 269 |
-
<label> </label>
|
| 270 |
-
<button className="btn btn-primary" onClick={handleQuery}
|
| 271 |
-
disabled={queryLoading || !ready || !queryText.trim()}>
|
| 272 |
-
{queryLoading ? "Searching..." : "Search"}
|
| 273 |
-
</button>
|
| 274 |
-
</div>
|
| 275 |
-
</div>
|
| 276 |
-
|
| 277 |
-
{queryResults.length > 0 && (
|
| 278 |
-
<div style={{ marginTop: 12 }}>
|
| 279 |
-
{queryResults.map((r, i) => (
|
| 280 |
-
<div key={i} className="result-card">
|
| 281 |
-
<div className="result-header">
|
| 282 |
-
<span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
|
| 283 |
-
<ScoreBar score={r.score} />
|
| 284 |
-
</div>
|
| 285 |
-
<div className="result-text">{r.text}</div>
|
| 286 |
-
</div>
|
| 287 |
-
))}
|
| 288 |
-
</div>
|
| 289 |
-
)}
|
| 290 |
-
</div>
|
| 291 |
</div>
|
| 292 |
);
|
| 293 |
}
|
|
|
|
| 1 |
+
import { useState, useEffect } from "react";
|
| 2 |
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { W2VInitResponse } from "../types";
|
|
|
|
|
|
|
|
|
|
| 4 |
import StatusMessage from "./StatusMessage";
|
| 5 |
import LogViewer from "./LogViewer";
|
| 6 |
import MetricCard from "./MetricCard";
|
| 7 |
|
| 8 |
+
interface Props {
|
| 9 |
+
onReady: (ready: boolean, info?: { vocab_size: number; sentences: number; vector_size: number }) => void;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
export default function Word2VecPanel({ onReady }: Props) {
|
| 13 |
+
const [statusChecked, setStatusChecked] = useState(false);
|
| 14 |
+
const [trainResult, setTrainResult] = useState<W2VInitResponse | null>(null);
|
| 15 |
+
|
| 16 |
const [vectorSize, setVectorSize] = useState(100);
|
| 17 |
const [windowSize, setWindowSize] = useState(5);
|
| 18 |
const [w2vEpochs, setW2vEpochs] = useState(50);
|
| 19 |
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 20 |
const [initLoading, setInitLoading] = useState(false);
|
| 21 |
+
const [error, setError] = useState("");
|
| 22 |
+
|
| 23 |
+
useEffect(() => {
|
| 24 |
+
api.w2vStatus().then(res => {
|
| 25 |
+
if (res.ready) {
|
| 26 |
+
onReady(true, { vocab_size: res.vocab_size!, sentences: res.sentences!, vector_size: res.vector_size! });
|
| 27 |
+
}
|
| 28 |
+
setStatusChecked(true);
|
| 29 |
+
}).catch(() => setStatusChecked(true));
|
| 30 |
+
}, []);
|
| 31 |
+
|
| 32 |
+
async function handleTrainFromEngine() {
|
| 33 |
+
setInitLoading(true); setError(""); setTrainResult(null);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
try {
|
| 35 |
+
const res = await api.w2vInitFromEngine({
|
|
|
|
|
|
|
|
|
|
| 36 |
vector_size: vectorSize,
|
| 37 |
window: windowSize,
|
| 38 |
epochs: w2vEpochs,
|
| 39 |
});
|
| 40 |
+
setTrainResult(res);
|
| 41 |
} catch (err) {
|
| 42 |
setError(getErrorMessage(err));
|
| 43 |
} finally {
|
|
|
|
| 45 |
}
|
| 46 |
}
|
| 47 |
|
| 48 |
+
if (!statusChecked) {
|
| 49 |
+
return <div className="panel"><p>Checking Word2Vec status...</p></div>;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
+
// Training complete — show results + continue button
|
| 53 |
+
if (trainResult) {
|
| 54 |
+
return (
|
| 55 |
+
<div>
|
| 56 |
+
<div className="panel">
|
| 57 |
+
<h2>Training Complete</h2>
|
| 58 |
+
<div className="metric-grid">
|
| 59 |
+
<MetricCard value={trainResult.vocab_size} label="Vocabulary" />
|
| 60 |
+
<MetricCard value={trainResult.sentences} label="Sentences" />
|
| 61 |
+
<MetricCard value={trainResult.vector_size} label="Dimensions" />
|
| 62 |
+
<MetricCard value={`${trainResult.seconds}s`} label="Train Time" />
|
| 63 |
+
</div>
|
| 64 |
+
<StatusMessage type="ok" message="Word2Vec model trained and saved. It will persist across restarts." />
|
| 65 |
+
<button className="btn btn-primary" style={{ marginTop: 12 }}
|
| 66 |
+
onClick={() => onReady(true, { vocab_size: trainResult.vocab_size, sentences: trainResult.sentences, vector_size: trainResult.vector_size })}>
|
| 67 |
+
Continue to Analysis
|
| 68 |
+
</button>
|
| 69 |
+
</div>
|
| 70 |
|
| 71 |
+
<LogViewer active={false} />
|
| 72 |
+
</div>
|
| 73 |
+
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
+
// Training form
|
|
|
|
| 77 |
return (
|
| 78 |
<div>
|
|
|
|
| 79 |
<div className="panel">
|
| 80 |
+
<h2>Word2Vec Baseline (gensim)</h2>
|
| 81 |
<p className="panel-desc">
|
| 82 |
Static embeddings — one vector per word, no context awareness.
|
| 83 |
+
Train on all documents loaded in the engine to use as a baseline comparison.
|
| 84 |
</p>
|
| 85 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 87 |
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 88 |
</button>
|
|
|
|
| 106 |
</div>
|
| 107 |
)}
|
| 108 |
|
| 109 |
+
<button className="btn btn-primary" onClick={handleTrainFromEngine}
|
| 110 |
+
disabled={initLoading} style={{ marginTop: 8 }}>
|
| 111 |
+
{initLoading ? <><span className="spinner" /> Training on all engine documents...</> : "Train Word2Vec"}
|
| 112 |
</button>
|
| 113 |
|
| 114 |
<LogViewer active={initLoading} />
|
| 115 |
</div>
|
| 116 |
|
| 117 |
{error && <StatusMessage type="err" message={error} />}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
</div>
|
| 119 |
);
|
| 120 |
}
|
frontend/src/components/Word2VecTools.tsx
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
|
| 4 |
+
import { scoreColor } from "../utils/colors";
|
| 5 |
+
import ScoreBar from "./ScoreBar";
|
| 6 |
+
import StatusMessage from "./StatusMessage";
|
| 7 |
+
import DocumentViewer from "./DocumentViewer";
|
| 8 |
+
|
| 9 |
+
/** Upper bounds for the "Top K" inputs; they mirror the `max` attributes on the fields. */
const SIM_TOP_K_MAX = 50;
const QUERY_TOP_K_MAX = 20;

/**
 * Coerce a "Top K" field value into an integer in `[1, max]`.
 *
 * The number inputs store `+e.target.value`, which is 0 when the field is
 * cleared, and the `min`/`max` attributes do not stop a user from typing an
 * out-of-range value. Clamping at request time keeps typing unrestricted
 * while never sending an unusable `top_k`.
 */
function clampTopK(value: number, max: number): number {
  if (!Number.isFinite(value)) return 1;
  return Math.min(max, Math.max(1, Math.trunc(value)));
}

/**
 * Word2Vec baseline tools: similar-word lookup, pairwise text comparison and
 * corpus search, each backed by one `api.w2v*` call.
 *
 * All three tools share a single error banner rendered at the top; starting a
 * new request clears it. Every handler re-checks its own preconditions
 * (non-blank input, no request in flight) because the Enter key can reach a
 * handler even while the matching button is disabled.
 */
export default function Word2VecTools() {
  const [error, setError] = useState("");

  // Similar words
  const [simWord, setSimWord] = useState("");
  const [simTopK, setSimTopK] = useState(10);
  const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
  const [simLoading, setSimLoading] = useState(false);

  // Compare
  const [compTextA, setCompTextA] = useState("");
  const [compTextB, setCompTextB] = useState("");
  const [compResult, setCompResult] = useState<CompareResponse | null>(null);
  const [compLoading, setCompLoading] = useState(false);

  // Search
  const [queryText, setQueryText] = useState("");
  const [queryTopK, setQueryTopK] = useState(5);
  const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
  const [queryLoading, setQueryLoading] = useState(false);

  /** Look up the nearest neighbours of the entered word. */
  async function handleSimilarWords(): Promise<void> {
    const word = simWord.trim();
    // Same conditions that disable the "Find" button; Enter bypasses the button.
    if (simLoading || !word) return;
    setSimLoading(true);
    setError("");
    try {
      const res = await api.w2vSimilarWords({ word, top_k: clampTopK(simTopK, SIM_TOP_K_MAX) });
      setSimResults(res.similar);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setSimLoading(false);
    }
  }

  /** Compute the similarity score between Text A and Text B. */
  async function handleCompare(): Promise<void> {
    if (compLoading || !compTextA.trim() || !compTextB.trim()) return;
    setCompLoading(true);
    setError("");
    try {
      const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
      setCompResult(res);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setCompLoading(false);
    }
  }

  /** Run a corpus search for the entered query. */
  async function handleQuery(): Promise<void> {
    // Same conditions that disable the "Search" button; Enter bypasses the button.
    if (queryLoading || !queryText.trim()) return;
    setQueryLoading(true);
    setError("");
    try {
      const res = await api.w2vQuery({ text: queryText, top_k: clampTopK(queryTopK, QUERY_TOP_K_MAX) });
      setQueryResults(res.results);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setQueryLoading(false);
    }
  }

  return (
    <div>
      {error && <StatusMessage type="err" message={error} />}

      <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: 16 }}>
        {/* Similar Words */}
        <div className="panel">
          <h3 style={{ marginTop: 0 }}>Similar Words</h3>
          <p className="panel-desc">
            Find words that appear in similar contexts using Word2Vec static embeddings.
          </p>
          <div className="form-row">
            <div className="form-group">
              <label>Word</label>
              <input value={simWord} onChange={e => setSimWord(e.target.value)}
                onKeyDown={e => { if (e.key === "Enter") void handleSimilarWords(); }}
                placeholder="e.g. pizza" />
            </div>
            <div className="form-group form-group-sm">
              <label>Top K</label>
              <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)}
                min={1} max={SIM_TOP_K_MAX} style={{ width: 60 }} />
            </div>
            <div className="form-group form-group-sm">
              <label> </label>
              <button className="btn btn-primary" onClick={() => void handleSimilarWords()}
                disabled={simLoading || !simWord.trim()}>
                {simLoading ? "..." : "Find"}
              </button>
            </div>
          </div>

          {simResults.length > 0 && (
            <table className="data-table" style={{ marginTop: 8 }}>
              <thead>
                <tr><th>Word</th><th>Similarity</th></tr>
              </thead>
              <tbody>
                {/* Index is kept in the key as a tiebreaker in case a word repeats. */}
                {simResults.map((r, i) => (
                  <tr key={`${i}:${r.word}`}>
                    <td style={{ fontWeight: 600 }}>{r.word}</td>
                    <td><ScoreBar score={r.score} /></td>
                  </tr>
                ))}
              </tbody>
            </table>
          )}
        </div>

        {/* Compare Texts */}
        <div className="panel">
          <h3 style={{ marginTop: 0 }}>Compare Texts</h3>
          <p className="panel-desc">
            Sentence similarity via averaged word vectors.
          </p>
          <div className="form-group" style={{ marginBottom: 8 }}>
            <label>Text A</label>
            <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
              placeholder="pizza gives me homework" />
          </div>
          <div className="form-group" style={{ marginBottom: 8 }}>
            <label>Text B</label>
            <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
              placeholder="school gives me homework" />
          </div>
          <button className="btn btn-primary" onClick={() => void handleCompare()}
            disabled={compLoading || !compTextA.trim() || !compTextB.trim()}>
            {compLoading ? "..." : "Compare"}
          </button>

          {compResult && (
            <div className="similarity-gauge" style={{ marginTop: 12 }}>
              <div className="similarity-value"
                style={{ color: scoreColor(compResult.similarity) }}>
                {compResult.similarity.toFixed(4)}
              </div>
              <div className="similarity-label">Word2Vec Cosine Similarity</div>
            </div>
          )}
        </div>
      </div>

      {/* Semantic Search — full width */}
      <div className="panel">
        <h3 style={{ marginTop: 0 }}>Semantic Search</h3>
        <p className="panel-desc">
          Search your corpus using averaged Word2Vec vectors.
        </p>
        <div className="form-row">
          <div className="form-group" style={{ flex: 1 }}>
            <label>Query</label>
            <input value={queryText} onChange={e => setQueryText(e.target.value)}
              onKeyDown={e => { if (e.key === "Enter") void handleQuery(); }}
              placeholder="a place where children learn" />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)}
              min={1} max={QUERY_TOP_K_MAX} style={{ width: 60 }} />
          </div>
          <div className="form-group form-group-sm">
            <label> </label>
            <button className="btn btn-primary" onClick={() => void handleQuery()}
              disabled={queryLoading || !queryText.trim()}>
              {queryLoading ? "Searching..." : "Search"}
            </button>
          </div>
        </div>

        {queryResults.length > 0 && (
          <div style={{ marginTop: 8 }}>
            {/*
              Key on the document as well as the position so a DocumentViewer is
              not reused for a different document when a new search returns.
              NOTE(review): assumes DocumentViewer may hold per-document state — confirm.
            */}
            {queryResults.map((r, i) => (
              <DocumentViewer key={`${i}:${r.doc_id}`} docId={r.doc_id}>
                <div className="result-card" style={{ cursor: "pointer" }}>
                  <div className="result-header">
                    <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
                    <ScoreBar score={r.score} />
                  </div>
                  <div className="result-text">{r.text}</div>
                </div>
              </DocumentViewer>
            ))}
          </div>
        )}
      </div>
    </div>
  );
}
|
frontend/tsconfig.tsbuildinfo
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"root":["./src/app.tsx","./src/api.ts","./src/main.tsx","./src/types.ts","./src/vite-env.d.ts","./src/components/batchanalysis.tsx","./src/components/contextanalysis.tsx","./src/components/datasetpanel.tsx","./src/components/documentviewer.tsx","./src/components/enginesetup.tsx","./src/components/evaluationdashboard.tsx","./src/components/keywordanalysis.tsx","./src/components/keywordmatcher.tsx","./src/components/logviewer.tsx","./src/components/metriccard.tsx","./src/components/scorebar.tsx","./src/components/select.tsx","./src/components/semanticsearch.tsx","./src/components/similarwords.tsx","./src/components/statusmessage.tsx","./src/components/switch.tsx","./src/components/textcompare.tsx","./src/components/toggle.tsx","./src/components/trainingpanel.tsx","./src/components/word2vecpanel.tsx","./src/components/word2vectools.tsx","./src/hooks/useapicall.ts","./src/hooks/usecorpusloader.ts","./src/utils/colors.ts"],"version":"5.9.3"}
|
server.py
CHANGED
|
@@ -145,12 +145,13 @@ evaluator: Optional[Evaluator] = None
|
|
| 145 |
w2v_engine: Optional[Word2VecEngine] = None
|
| 146 |
|
| 147 |
ENGINE_SAVE_DIR = Path(os.environ.get("ENGINE_STATE_DIR", str(BASE_DIR / "engine_state")))
|
|
|
|
| 148 |
|
| 149 |
|
| 150 |
@app.on_event("startup")
|
| 151 |
def _auto_restore():
|
| 152 |
-
"""Restore engine state from disk if
|
| 153 |
-
global engine, evaluator
|
| 154 |
if (ENGINE_SAVE_DIR / "meta.json").is_file():
|
| 155 |
try:
|
| 156 |
engine = ContextualSimilarityEngine.load(str(ENGINE_SAVE_DIR))
|
|
@@ -160,6 +161,13 @@ def _auto_restore():
|
|
| 160 |
len(engine.chunks), len(engine._doc_ids))
|
| 161 |
except Exception:
|
| 162 |
logger.exception("Failed to auto-restore engine state — starting fresh")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
|
| 165 |
@app.get("/api/logs/stream")
|
|
@@ -572,6 +580,18 @@ def get_corpus_texts(max_docs: int = Query(default=500, ge=1, le=10_000)):
|
|
| 572 |
return {"documents": result, "count": len(result)}
|
| 573 |
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
@app.post("/api/engine/save")
|
| 576 |
def save_engine():
|
| 577 |
"""Save current engine state to disk for later restore."""
|
|
@@ -615,9 +635,53 @@ def w2v_init(req: W2VInitRequest):
|
|
| 615 |
stats = w2v_engine.build_index()
|
| 616 |
elapsed = round(time.time() - t0, 2)
|
| 617 |
logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
return {**stats, "seconds": elapsed}
|
| 619 |
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
@app.post("/api/w2v/compare")
|
| 622 |
def w2v_compare(req: W2VCompareRequest):
|
| 623 |
_ensure_w2v()
|
|
@@ -642,6 +706,32 @@ def w2v_similar_words(req: W2VWordRequest):
|
|
| 642 |
return {"word": req.word, "similar": [{"word": w, "score": round(s, 4)} for w, s in similar]}
|
| 643 |
|
| 644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
# ------------------------------------------------------------------ #
|
| 646 |
# Dataset endpoints (HuggingFace Epstein Files)
|
| 647 |
# ------------------------------------------------------------------ #
|
|
|
|
| 145 |
w2v_engine: Optional[Word2VecEngine] = None
|
| 146 |
|
| 147 |
ENGINE_SAVE_DIR = Path(os.environ.get("ENGINE_STATE_DIR", str(BASE_DIR / "engine_state")))
|
| 148 |
+
W2V_SAVE_DIR = Path(os.environ.get("W2V_STATE_DIR", str(BASE_DIR / "w2v_state")))
|
| 149 |
|
| 150 |
|
| 151 |
@app.on_event("startup")
|
| 152 |
def _auto_restore():
|
| 153 |
+
"""Restore engine and W2V state from disk if previous saves exist."""
|
| 154 |
+
global engine, evaluator, w2v_engine
|
| 155 |
if (ENGINE_SAVE_DIR / "meta.json").is_file():
|
| 156 |
try:
|
| 157 |
engine = ContextualSimilarityEngine.load(str(ENGINE_SAVE_DIR))
|
|
|
|
| 161 |
len(engine.chunks), len(engine._doc_ids))
|
| 162 |
except Exception:
|
| 163 |
logger.exception("Failed to auto-restore engine state — starting fresh")
|
| 164 |
+
if Word2VecEngine.has_saved_state(str(W2V_SAVE_DIR)):
|
| 165 |
+
try:
|
| 166 |
+
w2v_engine = Word2VecEngine.load(str(W2V_SAVE_DIR))
|
| 167 |
+
logger.info("Auto-restored Word2Vec: %d sentences, %d vocab",
|
| 168 |
+
len(w2v_engine.sentences), len(w2v_engine.model.wv))
|
| 169 |
+
except Exception:
|
| 170 |
+
logger.exception("Failed to auto-restore Word2Vec state — starting fresh")
|
| 171 |
|
| 172 |
|
| 173 |
@app.get("/api/logs/stream")
|
|
|
|
| 580 |
return {"documents": result, "count": len(result)}
|
| 581 |
|
| 582 |
|
| 583 |
+
@app.get("/api/documents/{doc_id}")
def get_document(doc_id: str):
    """Return one document's text, rebuilt from its indexed chunks.

    The engine's chunks whose ``doc_id`` matches are ordered by
    ``chunk_index`` and joined with newlines.

    Returns:
        A dict with ``doc_id``, the joined ``text`` and ``num_chunks``.

    Raises:
        HTTPException: 404 when no chunk carries the requested ``doc_id``.
    """
    _ensure_engine()
    ordered = sorted(
        (chunk for chunk in engine.chunks if chunk.doc_id == doc_id),
        key=lambda chunk: chunk.chunk_index,
    )
    if not ordered:
        raise HTTPException(404, f"Document '{doc_id}' not found.")
    return {
        "doc_id": doc_id,
        "text": "\n".join(chunk.text for chunk in ordered),
        "num_chunks": len(ordered),
    }
|
| 593 |
+
|
| 594 |
+
|
| 595 |
@app.post("/api/engine/save")
|
| 596 |
def save_engine():
|
| 597 |
"""Save current engine state to disk for later restore."""
|
|
|
|
| 635 |
stats = w2v_engine.build_index()
|
| 636 |
elapsed = round(time.time() - t0, 2)
|
| 637 |
logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
|
| 638 |
+
# Auto-save so data persists across restarts
|
| 639 |
+
try:
|
| 640 |
+
w2v_engine.save(str(W2V_SAVE_DIR))
|
| 641 |
+
except Exception:
|
| 642 |
+
logger.warning("Auto-save W2V after init failed", exc_info=True)
|
| 643 |
return {**stats, "seconds": elapsed}
|
| 644 |
|
| 645 |
|
| 646 |
+
@app.post("/api/w2v/init-from-engine")
def w2v_init_from_engine(
    vector_size: int = Query(default=100, ge=50, le=500),
    window: int = Query(default=5, ge=1, le=20),
    epochs: int = Query(default=50, ge=1, le=200),
):
    """Train Word2Vec directly from all documents already loaded in the engine.

    This avoids the round-trip through the frontend and uses ALL engine docs.

    Chunks are grouped by ``doc_id``, ordered by ``chunk_index`` and joined
    with newlines (the same reconstruction ``get_document`` performs) before
    being added to a new ``Word2VecEngine``.

    Args:
        vector_size: Passed to ``Word2VecEngine`` (query parameter, 50-500).
        window: Passed to ``Word2VecEngine`` (query parameter, 1-20).
        epochs: Passed to ``Word2VecEngine`` (query parameter, 1-200).

    Returns:
        The stats dict from ``build_index()`` plus ``seconds`` (elapsed time)
        and ``documents_used`` (number of distinct ``doc_id`` values).

    Raises:
        HTTPException: 400 when the engine holds no chunks.
    """
    global w2v_engine
    _ensure_engine()
    if not engine.chunks:
        raise HTTPException(400, "No documents in the engine. Load a dataset first.")

    # Group chunks by doc_id to reconstruct full documents
    docs: dict[str, list] = {}
    for chunk in engine.chunks:
        docs.setdefault(chunk.doc_id, []).append(chunk)

    logger.info("Word2Vec init from engine: %d documents, vector_size=%d, window=%d, epochs=%d",
                len(docs), vector_size, window, epochs)
    t0 = time.time()
    # Build into a local first: if training raises, the previously loaded
    # model (if any) stays in place instead of being replaced by an untrained one.
    candidate = Word2VecEngine(vector_size=vector_size, window=window, epochs=epochs)
    for doc_id, doc_chunks in docs.items():
        # Stable sort, so chunks already in order keep their relative order.
        doc_chunks.sort(key=lambda c: c.chunk_index)
        candidate.add_document(doc_id, "\n".join(c.text for c in doc_chunks))
    stats = candidate.build_index()
    w2v_engine = candidate
    elapsed = round(time.time() - t0, 2)
    logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
    # Auto-save is best-effort: a failed save is logged and the trained
    # in-memory model is still returned to the caller.
    try:
        w2v_engine.save(str(W2V_SAVE_DIR))
    except Exception:
        logger.warning("Auto-save W2V after init failed", exc_info=True)
    return {**stats, "seconds": elapsed, "documents_used": len(docs)}
|
| 683 |
+
|
| 684 |
+
|
| 685 |
@app.post("/api/w2v/compare")
|
| 686 |
def w2v_compare(req: W2VCompareRequest):
|
| 687 |
_ensure_w2v()
|
|
|
|
| 706 |
return {"word": req.word, "similar": [{"word": w, "score": round(s, 4)} for w, s in similar]}
|
| 707 |
|
| 708 |
|
| 709 |
+
@app.get("/api/w2v/status")
def w2v_status():
    """Report whether a Word2Vec model is currently loaded in memory.

    Returns:
        When an engine with a non-``None`` model exists: ``ready`` True plus
        ``vocab_size``, ``sentences`` and ``vector_size``. Otherwise ``ready``
        False and ``has_saved_state``, the result of
        ``Word2VecEngine.has_saved_state`` for the W2V save directory.
    """
    loaded = w2v_engine
    if loaded is None or loaded.model is None:
        return {
            "ready": False,
            "has_saved_state": Word2VecEngine.has_saved_state(str(W2V_SAVE_DIR)),
        }
    return {
        "ready": True,
        "vocab_size": len(loaded.model.wv),
        "sentences": len(loaded.sentences),
        "vector_size": loaded.vector_size,
    }
|
| 721 |
+
|
| 722 |
+
|
| 723 |
+
@app.post("/api/w2v/reset")
def w2v_reset():
    """Drop the in-memory Word2Vec engine and remove its saved state from disk.

    Returns:
        A small status payload confirming the reset.
    """
    import shutil

    global w2v_engine

    # Forget the in-memory engine first, then remove whatever was persisted.
    w2v_engine = None

    saved_dir_exists = W2V_SAVE_DIR.is_dir()
    if saved_dir_exists:
        shutil.rmtree(str(W2V_SAVE_DIR))
        logger.info("Word2Vec state deleted from %s", W2V_SAVE_DIR)

    response = {
        "status": "ok",
        "message": "Word2Vec state cleared. You can retrain now.",
    }
    return response
|
| 733 |
+
|
| 734 |
+
|
| 735 |
# ------------------------------------------------------------------ #
|
| 736 |
# Dataset endpoints (HuggingFace Epstein Files)
|
| 737 |
# ------------------------------------------------------------------ #
|
word2vec_baseline.py
CHANGED
|
@@ -16,9 +16,11 @@ Usage:
|
|
| 16 |
score = w2v.compare_texts("pizza gives me homework", "school gives me homework")
|
| 17 |
"""
|
| 18 |
|
|
|
|
| 19 |
import re
|
| 20 |
import logging
|
| 21 |
from dataclasses import dataclass
|
|
|
|
| 22 |
from typing import Optional
|
| 23 |
|
| 24 |
import numpy as np
|
|
@@ -145,6 +147,76 @@ class Word2VecEngine:
|
|
| 145 |
return 0.0
|
| 146 |
return float(self.model.wv.similarity(a, b))
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
# ------------------------------------------------------------------ #
|
| 149 |
|
| 150 |
def _sentence_vector(self, tokens: list[str]) -> np.ndarray:
|
|
|
|
| 16 |
score = w2v.compare_texts("pizza gives me homework", "school gives me homework")
|
| 17 |
"""
|
| 18 |
|
| 19 |
+
import json
|
| 20 |
import re
|
| 21 |
import logging
|
| 22 |
from dataclasses import dataclass
|
| 23 |
+
from pathlib import Path
|
| 24 |
from typing import Optional
|
| 25 |
|
| 26 |
import numpy as np
|
|
|
|
| 147 |
return 0.0
|
| 148 |
return float(self.model.wv.similarity(a, b))
|
| 149 |
|
| 150 |
+
# ------------------------------------------------------------------ #
|
| 151 |
+
# Persistence
|
| 152 |
+
# ------------------------------------------------------------------ #
|
| 153 |
+
|
| 154 |
+
def save(self, directory: str) -> dict:
    """Save trained Word2Vec state to disk for later restore.

    Four files are written into ``directory`` (created if missing):
    ``w2v.model``, ``sentence_vecs.npy``, ``w2v_sentences.json`` and
    ``w2v_meta.json``.

    Args:
        directory: Target directory for the saved state.

    Returns:
        The metadata dict that was written to ``w2v_meta.json``
        (hyper-parameters plus sentence and vocabulary counts).

    Raises:
        RuntimeError: If no model has been trained yet. The check runs before
            anything touches the filesystem, so a failed call leaves no empty
            directory behind.
    """
    # Validate before creating anything on disk.
    if self.model is None:
        raise RuntimeError("Cannot save: model has not been trained yet.")

    save_dir = Path(directory)
    save_dir.mkdir(parents=True, exist_ok=True)

    meta = {
        "vector_size": self.vector_size,
        "window": self.window,
        "min_count": self.min_count,
        "epochs": self.epochs,
        "sg": self.sg,
        "num_sentences": len(self.sentences),
        "vocab_size": len(self.model.wv),
    }

    self.model.save(str(save_dir / "w2v.model"))
    # NOTE(review): assumes sentence_vecs is a numeric array once the model is
    # trained; confirm against the code that builds the index.
    np.save(save_dir / "sentence_vecs.npy", self.sentence_vecs)

    # Save sentences and their doc mappings
    with open(save_dir / "w2v_sentences.json", "w", encoding="utf-8") as f:
        json.dump({"sentences": self.sentences, "sentence_docs": self.sentence_docs}, f)

    # w2v_meta.json is the marker that load() and has_saved_state() look for,
    # so it is written last: a save interrupted part-way into a fresh
    # directory is then not reported as a restorable state.
    with open(save_dir / "w2v_meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    logger.info("Word2Vec saved to %s: %d sentences, %d vocab",
                directory, len(self.sentences), len(self.model.wv))
    return meta
|
| 184 |
+
|
| 185 |
+
@classmethod
def load(cls, directory: str) -> "Word2VecEngine":
    """Rebuild an engine from the files found in ``directory``.

    Reads ``w2v_meta.json`` for the constructor arguments, then restores the
    model from ``w2v.model``, the sentence vectors from ``sentence_vecs.npy``
    and the sentences with their document mapping from ``w2v_sentences.json``.

    Args:
        directory: Directory holding a previously saved state.

    Returns:
        A new engine instance populated from the saved files.

    Raises:
        FileNotFoundError: If ``w2v_meta.json`` is not present in ``directory``.
    """
    root = Path(directory)
    meta_path = root / "w2v_meta.json"
    if not meta_path.is_file():
        raise FileNotFoundError(f"No saved Word2Vec state at {directory}")

    with open(meta_path) as meta_file:
        params = json.load(meta_file)

    # Constructor arguments are taken verbatim from the stored metadata.
    ctor_keys = ("vector_size", "window", "min_count", "epochs", "sg")
    restored = cls(**{key: params[key] for key in ctor_keys})

    restored.model = Word2Vec.load(str(root / "w2v.model"))
    restored.sentence_vecs = np.load(root / "sentence_vecs.npy")

    with open(root / "w2v_sentences.json") as sentences_file:
        payload = json.load(sentences_file)
    restored.sentences = payload["sentences"]
    restored.sentence_docs = payload["sentence_docs"]

    logger.info("Word2Vec loaded from %s: %d sentences, %d vocab",
                directory, len(restored.sentences), len(restored.model.wv))
    return restored
|
| 214 |
+
|
| 215 |
+
@staticmethod
def has_saved_state(directory: str) -> bool:
    """Tell whether ``directory`` contains the ``w2v_meta.json`` marker file.

    Args:
        directory: Directory to inspect.

    Returns:
        True when ``w2v_meta.json`` exists there as a regular file, else False.
    """
    marker = Path(directory, "w2v_meta.json")
    return marker.is_file()
|
| 219 |
+
|
| 220 |
# ------------------------------------------------------------------ #
|
| 221 |
|
| 222 |
def _sentence_vector(self, tokens: list[str]) -> np.ndarray:
|