Besjon Cifliku committed on
Commit
9f87ec0
·
1 Parent(s): f930251

feat: simplify the workflow and search patterns

Browse files
frontend/src/App.tsx CHANGED
@@ -10,12 +10,13 @@ import KeywordMatcher from "./components/KeywordMatcher";
10
  import BatchAnalysis from "./components/BatchAnalysis";
11
  import SimilarWords from "./components/SimilarWords";
12
  import ContextAnalysis from "./components/ContextAnalysis";
13
- import EvaluationDashboard from "./components/EvaluationDashboard";
14
  import Word2VecPanel from "./components/Word2VecPanel";
 
15
  import DatasetPanel from "./components/DatasetPanel";
 
16
  import "./styles.css";
17
 
18
- type NavGroup = "data" | "training" | "analysis" | "evaluation";
19
  type TrainingTab = "model" | "w2v";
20
  type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";
21
 
@@ -23,7 +24,6 @@ const STEPS: { id: NavGroup; label: string; needsIndex?: boolean }[] = [
23
  { id: "data", label: "Data & Setup" },
24
  { id: "training", label: "Training" },
25
  { id: "analysis", label: "Analysis", needsIndex: true },
26
- { id: "evaluation", label: "Evaluation", needsIndex: true },
27
  ];
28
 
29
  const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
@@ -48,14 +48,22 @@ export default function App() {
48
  const [stats, setStats] = useState<CorpusStats | null>(null);
49
  const [showManualSetup, setShowManualSetup] = useState(false);
50
  const [serverError, setServerError] = useState<string | null>(null);
 
 
 
51
  const ready = stats !== null && stats.index_built;
52
 
53
  useEffect(() => {
54
  checkConnection().then((err) => {
55
  setServerError(err);
56
- // If server is up, try to fetch stats (engine may have been auto-restored)
57
  if (!err) {
58
  api.getStats().then(setStats).catch(() => {});
 
 
 
 
 
 
59
  }
60
  });
61
  const interval = setInterval(() => {
@@ -64,11 +72,99 @@ export default function App() {
64
  return () => clearInterval(interval);
65
  }, []);
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  function handleStepClick(id: NavGroup, needsIndex?: boolean) {
68
  if (needsIndex && !ready) return;
69
  setGroup(id);
70
  }
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  return (
73
  <div className="app">
74
  <header className="app-header">
@@ -91,7 +187,7 @@ export default function App() {
91
  </div>
92
  )}
93
 
94
- {/* Progress Stepper (serves as main navigation) */}
95
  <nav className="stepper">
96
  {STEPS.map((step, i) => {
97
  const disabled = step.needsIndex && !ready;
@@ -119,7 +215,7 @@ export default function App() {
119
  })}
120
  </nav>
121
 
122
- {/* Sub-tabs for groups with multiple views */}
123
  {group === "training" && (
124
  <nav className="subtabs">
125
  {TRAINING_TABS.map((t) => (
@@ -165,7 +261,7 @@ export default function App() {
165
  )}
166
 
167
  {group === "training" && trainingTab === "model" && <TrainingPanel />}
168
- {group === "training" && trainingTab === "w2v" && <Word2VecPanel />}
169
 
170
  {group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
171
  {group === "analysis" && analysisTab === "words" && <SimilarWords />}
@@ -174,8 +270,6 @@ export default function App() {
174
  {group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
175
  {group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
176
  {group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}
177
-
178
- {group === "evaluation" && <EvaluationDashboard />}
179
  </main>
180
  </div>
181
  );
 
10
  import BatchAnalysis from "./components/BatchAnalysis";
11
  import SimilarWords from "./components/SimilarWords";
12
  import ContextAnalysis from "./components/ContextAnalysis";
 
13
  import Word2VecPanel from "./components/Word2VecPanel";
14
+ import Word2VecTools from "./components/Word2VecTools";
15
  import DatasetPanel from "./components/DatasetPanel";
16
+ import MetricCard from "./components/MetricCard";
17
  import "./styles.css";
18
 
19
+ type NavGroup = "data" | "training" | "analysis";
20
  type TrainingTab = "model" | "w2v";
21
  type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";
22
 
 
24
  { id: "data", label: "Data & Setup" },
25
  { id: "training", label: "Training" },
26
  { id: "analysis", label: "Analysis", needsIndex: true },
 
27
  ];
28
 
29
  const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
 
48
  const [stats, setStats] = useState<CorpusStats | null>(null);
49
  const [showManualSetup, setShowManualSetup] = useState(false);
50
  const [serverError, setServerError] = useState<string | null>(null);
51
+ const [w2vReady, setW2vReady] = useState(false);
52
+ const [w2vInfo, setW2vInfo] = useState<{ vocab_size: number; sentences: number; vector_size: number } | null>(null);
53
+ const [resetLoading, setResetLoading] = useState(false);
54
  const ready = stats !== null && stats.index_built;
55
 
56
  useEffect(() => {
57
  checkConnection().then((err) => {
58
  setServerError(err);
 
59
  if (!err) {
60
  api.getStats().then(setStats).catch(() => {});
61
+ api.w2vStatus().then(res => {
62
+ if (res.ready) {
63
+ setW2vReady(true);
64
+ setW2vInfo({ vocab_size: res.vocab_size!, sentences: res.sentences!, vector_size: res.vector_size! });
65
+ }
66
+ }).catch(() => {});
67
  }
68
  });
69
  const interval = setInterval(() => {
 
72
  return () => clearInterval(interval);
73
  }, []);
74
 
75
+ function handleW2vReady(ready: boolean, info?: { vocab_size: number; sentences: number; vector_size: number }) {
76
+ setW2vReady(ready);
77
+ setW2vInfo(ready && info ? info : null);
78
+ }
79
+
80
+ async function handleReset() {
81
+ setResetLoading(true);
82
+ try {
83
+ await api.w2vReset();
84
+ setW2vReady(false);
85
+ setW2vInfo(null);
86
+ } catch {
87
+ // ignore
88
+ } finally {
89
+ setResetLoading(false);
90
+ }
91
+ }
92
+
93
  function handleStepClick(id: NavGroup, needsIndex?: boolean) {
94
  if (needsIndex && !ready) return;
95
  setGroup(id);
96
  }
97
 
98
+ // ── W2V trained: stats bar + analysis tabs, no stepper ──
99
+ if (w2vReady && w2vInfo) {
100
+ return (
101
+ <div className="app">
102
+ <header className="app-header">
103
+ <h1>Contextual Similarity Engine</h1>
104
+ {stats && (
105
+ <div className="header-stats">
106
+ <span className="badge">{stats.model_name}</span>
107
+ <span className="badge">{stats.total_documents} docs</span>
108
+ <span className="badge">{stats.total_chunks} chunks</span>
109
+ </div>
110
+ )}
111
+ </header>
112
+
113
+ {serverError && (
114
+ <div className="server-error-banner">
115
+ <strong>Server unavailable:</strong> {serverError}
116
+ </div>
117
+ )}
118
+
119
+ {/* W2V stats bar */}
120
+ <div className="content">
121
+ <div className="panel">
122
+ <div style={{ display: "flex", alignItems: "center", justifyContent: "space-between", flexWrap: "wrap", gap: 12 }}>
123
+ <h2 style={{ margin: 0 }}>Word2Vec Baseline</h2>
124
+ <button className="btn btn-secondary" onClick={handleReset} disabled={resetLoading}
125
+ style={{ fontSize: "0.85em" }}>
126
+ {resetLoading ? "Resetting..." : "Reset & Retrain"}
127
+ </button>
128
+ </div>
129
+ <div className="metric-grid" style={{ marginTop: 12 }}>
130
+ <MetricCard value={w2vInfo.vocab_size} label="Vocabulary" />
131
+ <MetricCard value={w2vInfo.sentences} label="Sentences" />
132
+ <MetricCard value={w2vInfo.vector_size} label="Dimensions" />
133
+ </div>
134
+ </div>
135
+
136
+ {/* W2V-specific tools: Similar Words, Compare, Semantic Search */}
137
+ <Word2VecTools />
138
+ </div>
139
+
140
+ {/* Transformer Analysis sub-tabs */}
141
+ <nav className="subtabs">
142
+ {ANALYSIS_TABS.map((t) => (
143
+ <button
144
+ key={t.id}
145
+ className={`subtab ${analysisTab === t.id ? "subtab-active" : ""}`}
146
+ onClick={() => setAnalysisTab(t.id)}
147
+ >
148
+ {t.label}
149
+ </button>
150
+ ))}
151
+ </nav>
152
+
153
+ {/* Analysis content */}
154
+ <main className="content">
155
+ {analysisTab === "context" && <ContextAnalysis />}
156
+ {analysisTab === "words" && <SimilarWords />}
157
+ {analysisTab === "search" && <SemanticSearch />}
158
+ {analysisTab === "compare" && <TextCompare />}
159
+ {analysisTab === "keyword" && <KeywordAnalysis />}
160
+ {analysisTab === "match" && <KeywordMatcher />}
161
+ {analysisTab === "batch" && <BatchAnalysis />}
162
+ </main>
163
+ </div>
164
+ );
165
+ }
166
+
167
+ // ── Normal stepper flow ──
168
  return (
169
  <div className="app">
170
  <header className="app-header">
 
187
  </div>
188
  )}
189
 
190
+ {/* Progress Stepper */}
191
  <nav className="stepper">
192
  {STEPS.map((step, i) => {
193
  const disabled = step.needsIndex && !ready;
 
215
  })}
216
  </nav>
217
 
218
+ {/* Sub-tabs */}
219
  {group === "training" && (
220
  <nav className="subtabs">
221
  {TRAINING_TABS.map((t) => (
 
261
  )}
262
 
263
  {group === "training" && trainingTab === "model" && <TrainingPanel />}
264
+ {group === "training" && trainingTab === "w2v" && <Word2VecPanel onReady={handleW2vReady} />}
265
 
266
  {group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
267
  {group === "analysis" && analysisTab === "words" && <SimilarWords />}
 
270
  {group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
271
  {group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
272
  {group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}
 
 
273
  </main>
274
  </div>
275
  );
frontend/src/api.ts CHANGED
@@ -110,6 +110,9 @@ export const api = {
110
  getCorpusTexts: (maxDocs: number = 500) =>
111
  client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
112
 
 
 
 
113
  // ---- Engine persistence ----
114
  saveEngine: () =>
115
  client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
@@ -131,6 +134,18 @@ export const api = {
131
  w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
132
  client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
133
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  w2vCompare: (data: { text_a: string; text_b: string }) =>
135
  client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
136
 
 
110
  getCorpusTexts: (maxDocs: number = 500) =>
111
  client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
112
 
113
+ getDocument: (docId: string) =>
114
+ client.get<{ doc_id: string; text: string; num_chunks: number }>(`/documents/${encodeURIComponent(docId)}`).then(r => r.data),
115
+
116
  // ---- Engine persistence ----
117
  saveEngine: () =>
118
  client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
 
134
  w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
135
  client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
136
 
137
+ w2vInitFromEngine: (params?: { vector_size?: number; window?: number; epochs?: number }) =>
138
+ client.post<W2VInitResponse & { documents_used: number }>(`/w2v/init-from-engine`, null, {
139
+ ...long,
140
+ params: { ...(_sign && { __sign: _sign }), ...params },
141
+ }).then(r => r.data),
142
+
143
+ w2vStatus: () =>
144
+ client.get<{ ready: boolean; vocab_size?: number; sentences?: number; vector_size?: number; has_saved_state?: boolean }>("/w2v/status").then(r => r.data),
145
+
146
+ w2vReset: () =>
147
+ client.post<{ status: string; message: string }>("/w2v/reset").then(r => r.data),
148
+
149
  w2vCompare: (data: { text_a: string; text_b: string }) =>
150
  client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
151
 
frontend/src/components/DocumentViewer.tsx ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, type ReactNode } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+
4
+ interface Props {
5
+ docId: string;
6
+ children: ReactNode;
7
+ }
8
+
9
+ export default function DocumentViewer({ docId, children }: Props) {
10
+ const [expanded, setExpanded] = useState(false);
11
+ const [fullText, setFullText] = useState<string | null>(null);
12
+ const [loading, setLoading] = useState(false);
13
+ const [error, setError] = useState("");
14
+
15
+ async function handleClick() {
16
+ if (expanded) {
17
+ setExpanded(false);
18
+ return;
19
+ }
20
+ if (fullText !== null) {
21
+ setExpanded(true);
22
+ return;
23
+ }
24
+ setLoading(true); setError("");
25
+ try {
26
+ const res = await api.getDocument(docId);
27
+ setFullText(res.text);
28
+ setExpanded(true);
29
+ } catch (err) {
30
+ setError(getErrorMessage(err));
31
+ } finally {
32
+ setLoading(false);
33
+ }
34
+ }
35
+
36
+ return (
37
+ <div>
38
+ <div onClick={handleClick}>
39
+ {children}
40
+ </div>
41
+ {loading && (
42
+ <div style={{ padding: "8px 12px", color: "var(--text-dim)", fontSize: "0.85rem" }}>
43
+ Loading document...
44
+ </div>
45
+ )}
46
+ {error && (
47
+ <div style={{ padding: "8px 12px", color: "var(--danger)", fontSize: "0.85rem" }}>
48
+ {error}
49
+ </div>
50
+ )}
51
+ {expanded && fullText !== null && (
52
+ <div style={{
53
+ background: "var(--bg-elevated, #0d1117)",
54
+ border: "1px solid var(--border)",
55
+ borderTop: "none",
56
+ borderRadius: "0 0 var(--radius) var(--radius)",
57
+ padding: "12px 16px",
58
+ marginTop: -4,
59
+ marginBottom: 8,
60
+ maxHeight: 400,
61
+ overflowY: "auto",
62
+ fontSize: "0.82rem",
63
+ lineHeight: 1.7,
64
+ whiteSpace: "pre-wrap",
65
+ wordBreak: "break-word",
66
+ color: "var(--text)",
67
+ }}>
68
+ <div style={{ display: "flex", justifyContent: "space-between", marginBottom: 8 }}>
69
+ <span style={{ fontWeight: 600 }}>{docId}</span>
70
+ <button
71
+ onClick={(e) => { e.stopPropagation(); setExpanded(false); }}
72
+ style={{
73
+ background: "none", border: "none", color: "var(--text-dim)",
74
+ cursor: "pointer", fontSize: "0.8rem",
75
+ }}>
76
+ Close
77
+ </button>
78
+ </div>
79
+ {fullText}
80
+ </div>
81
+ )}
82
+ </div>
83
+ );
84
+ }
frontend/src/components/SemanticSearch.tsx CHANGED
@@ -4,6 +4,7 @@ import type { QueryResultItem } from "../types";
4
  import { useApiCall } from "../hooks/useApiCall";
5
  import ScoreBar from "./ScoreBar";
6
  import StatusMessage from "./StatusMessage";
 
7
 
8
  export default function SemanticSearch() {
9
  const [query, setQuery] = useState("");
@@ -51,17 +52,19 @@ export default function SemanticSearch() {
51
  <div className="panel">
52
  <h3>Results ({results.length})</h3>
53
  {results.map((r) => (
54
- <div key={`${r.doc_id}-${r.chunk_index}`} className="result-card">
55
- <div className="result-header">
56
- <div>
57
- <span className="badge">#{r.rank}</span>{" "}
58
- <span className="badge">{r.doc_id}</span>{" "}
59
- <span className="tag">chunk {r.chunk_index}</span>
 
 
 
60
  </div>
61
- <ScoreBar score={r.score} />
62
  </div>
63
- <div className="result-text">{r.text}</div>
64
- </div>
65
  ))}
66
  </div>
67
  )}
 
4
  import { useApiCall } from "../hooks/useApiCall";
5
  import ScoreBar from "./ScoreBar";
6
  import StatusMessage from "./StatusMessage";
7
+ import DocumentViewer from "./DocumentViewer";
8
 
9
  export default function SemanticSearch() {
10
  const [query, setQuery] = useState("");
 
52
  <div className="panel">
53
  <h3>Results ({results.length})</h3>
54
  {results.map((r) => (
55
+ <DocumentViewer key={`${r.doc_id}-${r.chunk_index}`} docId={r.doc_id}>
56
+ <div className="result-card" style={{ cursor: "pointer" }}>
57
+ <div className="result-header">
58
+ <div>
59
+ <span className="badge">#{r.rank}</span>{" "}
60
+ <span className="badge">{r.doc_id}</span>{" "}
61
+ <span className="tag">chunk {r.chunk_index}</span>
62
+ </div>
63
+ <ScoreBar score={r.score} />
64
  </div>
65
+ <div className="result-text">{r.text}</div>
66
  </div>
67
+ </DocumentViewer>
 
68
  ))}
69
  </div>
70
  )}
frontend/src/components/TrainingPanel.tsx CHANGED
@@ -1,9 +1,7 @@
1
  import { useState } from "react";
2
  import { api, getErrorMessage } from "../api";
3
- import type { TrainResponse, QueryResultItem, CompareResponse } from "../types";
4
  import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
- import { scoreColor } from "../utils/colors";
6
- import ScoreBar from "./ScoreBar";
7
  import StatusMessage from "./StatusMessage";
8
  import MetricCard from "./MetricCard";
9
  import Toggle from "./Toggle";
@@ -12,11 +10,6 @@ import LogViewer from "./LogViewer";
12
 
13
  type Strategy = "unsupervised" | "contrastive" | "keywords";
14
 
15
- interface SimilarWord {
16
- word: string;
17
- score: number;
18
- }
19
-
20
  const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
21
  { id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
22
  { id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
@@ -42,26 +35,6 @@ export default function TrainingPanel() {
42
 
43
  const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
44
 
45
- // Similar words
46
- const [simWord, setSimWord] = useState("");
47
- const [simTopK, setSimTopK] = useState(10);
48
- const [simResults, setSimResults] = useState<SimilarWord[]>([]);
49
- const [simLoading, setSimLoading] = useState(false);
50
-
51
- // Compare
52
- const [compTextA, setCompTextA] = useState("");
53
- const [compTextB, setCompTextB] = useState("");
54
- const [compResult, setCompResult] = useState<CompareResponse | null>(null);
55
- const [compLoading, setCompLoading] = useState(false);
56
-
57
- // Search
58
- const [queryText, setQueryText] = useState("");
59
- const [queryTopK, setQueryTopK] = useState(5);
60
- const [queryResults, setQueryResults] = useState<QueryResultItem[]>([]);
61
- const [queryLoading, setQueryLoading] = useState(false);
62
-
63
- const ready = result !== null;
64
-
65
  async function handleTrain() {
66
  setTraining(true); setError(""); setResult(null);
67
  try {
@@ -87,42 +60,6 @@ export default function TrainingPanel() {
87
  }
88
  }
89
 
90
- async function handleSimilarWords() {
91
- setSimLoading(true); setError("");
92
- try {
93
- const res = await api.similarWords({ word: simWord, top_k: simTopK });
94
- setSimResults(res.similar);
95
- } catch (err) {
96
- setError(getErrorMessage(err));
97
- } finally {
98
- setSimLoading(false);
99
- }
100
- }
101
-
102
- async function handleCompare() {
103
- setCompLoading(true); setError("");
104
- try {
105
- const res = await api.compare({ text_a: compTextA, text_b: compTextB });
106
- setCompResult(res);
107
- } catch (err) {
108
- setError(getErrorMessage(err));
109
- } finally {
110
- setCompLoading(false);
111
- }
112
- }
113
-
114
- async function handleQuery() {
115
- setQueryLoading(true); setError("");
116
- try {
117
- const res = await api.query({ text: queryText, top_k: queryTopK });
118
- setQueryResults(res.results);
119
- } catch (err) {
120
- setError(getErrorMessage(err));
121
- } finally {
122
- setQueryLoading(false);
123
- }
124
- }
125
-
126
  return (
127
  <div>
128
  {/* 1. Training (strategy + config + corpus merged) */}
@@ -223,127 +160,9 @@ export default function TrainingPanel() {
223
  <MetricCard value={`${result.seconds}s`} label="Time" />
224
  </div>
225
  <StatusMessage type="ok"
226
- message={`Model saved: ${result.model_path} — use this path in the Setup tab.`} />
227
  </div>
228
  )}
229
-
230
- {/* 2. Similar Words */}
231
- <div className="panel">
232
- <h2>2. Similar Words</h2>
233
- <p className="panel-desc">
234
- Find words that appear in similar contexts using transformer embeddings.
235
- </p>
236
- <div className="form-row">
237
- <div className="form-group">
238
- <label>Word</label>
239
- <input value={simWord} onChange={e => setSimWord(e.target.value)}
240
- onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
241
- placeholder="e.g. pizza" />
242
- </div>
243
- <div className="form-group form-group-sm">
244
- <label>Top K</label>
245
- <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
246
- </div>
247
- <div className="form-group form-group-sm">
248
- <label>&nbsp;</label>
249
- <button className="btn btn-primary" onClick={handleSimilarWords}
250
- disabled={simLoading || !simWord.trim()}>
251
- {simLoading ? "Searching..." : "Find"}
252
- </button>
253
- </div>
254
- </div>
255
-
256
- {simResults.length > 0 && (
257
- <table className="data-table" style={{ marginTop: 12 }}>
258
- <thead>
259
- <tr><th>Word</th><th>Similarity</th></tr>
260
- </thead>
261
- <tbody>
262
- {simResults.map((r, i) => (
263
- <tr key={i}>
264
- <td style={{ fontWeight: 600 }}>{r.word}</td>
265
- <td><ScoreBar score={r.score} /></td>
266
- </tr>
267
- ))}
268
- </tbody>
269
- </table>
270
- )}
271
- </div>
272
-
273
- {/* 3. Compare Texts */}
274
- <div className="panel">
275
- <h2>3. Compare Texts</h2>
276
- <p className="panel-desc">
277
- Sentence similarity via transformer contextual embeddings.
278
- </p>
279
- <div className="form-row">
280
- <div className="form-group">
281
- <label>Text A</label>
282
- <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
283
- placeholder="pizza gives me homework" />
284
- </div>
285
- <div className="form-group">
286
- <label>Text B</label>
287
- <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
288
- placeholder="school gives me homework" />
289
- </div>
290
- </div>
291
- <button className="btn btn-primary" onClick={handleCompare}
292
- disabled={compLoading || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
293
- {compLoading ? "Comparing..." : "Compare"}
294
- </button>
295
-
296
- {compResult && (
297
- <div className="similarity-gauge" style={{ marginTop: 16 }}>
298
- <div className="similarity-value"
299
- style={{ color: scoreColor(compResult.similarity) }}>
300
- {compResult.similarity.toFixed(4)}
301
- </div>
302
- <div className="similarity-label">Transformer Cosine Similarity</div>
303
- </div>
304
- )}
305
- </div>
306
-
307
- {/* 4. Semantic Search */}
308
- <div className="panel">
309
- <h2>4. Semantic Search</h2>
310
- <p className="panel-desc">
311
- Search your corpus using transformer embeddings.
312
- </p>
313
- <div className="form-row">
314
- <div className="form-group">
315
- <label>Query</label>
316
- <input value={queryText} onChange={e => setQueryText(e.target.value)}
317
- onKeyDown={e => e.key === "Enter" && handleQuery()}
318
- placeholder="a place where children learn" />
319
- </div>
320
- <div className="form-group form-group-sm">
321
- <label>Top K</label>
322
- <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
323
- </div>
324
- <div className="form-group form-group-sm">
325
- <label>&nbsp;</label>
326
- <button className="btn btn-primary" onClick={handleQuery}
327
- disabled={queryLoading || !queryText.trim()}>
328
- {queryLoading ? "Searching..." : "Search"}
329
- </button>
330
- </div>
331
- </div>
332
-
333
- {queryResults.length > 0 && (
334
- <div style={{ marginTop: 12 }}>
335
- {queryResults.map((r, i) => (
336
- <div key={i} className="result-card">
337
- <div className="result-header">
338
- <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
339
- <ScoreBar score={r.score} />
340
- </div>
341
- <div className="result-text">{r.text}</div>
342
- </div>
343
- ))}
344
- </div>
345
- )}
346
- </div>
347
  </div>
348
  );
349
  }
 
1
  import { useState } from "react";
2
  import { api, getErrorMessage } from "../api";
3
+ import type { TrainResponse } from "../types";
4
  import { useCorpusLoader } from "../hooks/useCorpusLoader";
 
 
5
  import StatusMessage from "./StatusMessage";
6
  import MetricCard from "./MetricCard";
7
  import Toggle from "./Toggle";
 
10
 
11
  type Strategy = "unsupervised" | "contrastive" | "keywords";
12
 
 
 
 
 
 
13
  const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
14
  { id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
15
  { id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
 
35
 
36
  const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  async function handleTrain() {
39
  setTraining(true); setError(""); setResult(null);
40
  try {
 
60
  }
61
  }
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  return (
64
  <div>
65
  {/* 1. Training (strategy + config + corpus merged) */}
 
160
  <MetricCard value={`${result.seconds}s`} label="Time" />
161
  </div>
162
  <StatusMessage type="ok"
163
+ message={`Model saved: ${result.model_path} — use this path in the Setup tab, then go to Analysis to explore results.`} />
164
  </div>
165
  )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  </div>
167
  );
168
  }
frontend/src/components/Word2VecPanel.tsx CHANGED
@@ -1,54 +1,43 @@
1
- import { useState } from "react";
2
  import { api, getErrorMessage } from "../api";
3
- import type { W2VInitResponse, W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
4
- import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
- import { scoreColor } from "../utils/colors";
6
- import ScoreBar from "./ScoreBar";
7
  import StatusMessage from "./StatusMessage";
8
  import LogViewer from "./LogViewer";
9
  import MetricCard from "./MetricCard";
10
 
11
- export default function Word2VecPanel() {
12
- // Init
 
 
 
 
 
 
13
  const [vectorSize, setVectorSize] = useState(100);
14
  const [windowSize, setWindowSize] = useState(5);
15
  const [w2vEpochs, setW2vEpochs] = useState(50);
16
  const [showAdvanced, setShowAdvanced] = useState(false);
17
  const [initLoading, setInitLoading] = useState(false);
18
- const [initResult, setInitResult] = useState<W2VInitResponse | null>(null);
19
-
20
- const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
21
-
22
- // Similar words
23
- const [simWord, setSimWord] = useState("");
24
- const [simTopK, setSimTopK] = useState(10);
25
- const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
26
- const [simLoading, setSimLoading] = useState(false);
27
-
28
- // Compare
29
- const [compTextA, setCompTextA] = useState("");
30
- const [compTextB, setCompTextB] = useState("");
31
- const [compResult, setCompResult] = useState<CompareResponse | null>(null);
32
- const [compLoading, setCompLoading] = useState(false);
33
-
34
- // Query
35
- const [queryText, setQueryText] = useState("");
36
- const [queryTopK, setQueryTopK] = useState(5);
37
- const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
38
- const [queryLoading, setQueryLoading] = useState(false);
39
-
40
- async function handleInit() {
41
- setInitLoading(true); setError(""); setInitResult(null);
42
  try {
43
- const corpus = parseCorpus();
44
- if (!corpus.length) { setError("Corpus is empty."); setInitLoading(false); return; }
45
- const res = await api.w2vInit({
46
- corpus_texts: corpus,
47
  vector_size: vectorSize,
48
  window: windowSize,
49
  epochs: w2vEpochs,
50
  });
51
- setInitResult(res);
52
  } catch (err) {
53
  setError(getErrorMessage(err));
54
  } finally {
@@ -56,76 +45,44 @@ export default function Word2VecPanel() {
56
  }
57
  }
58
 
59
- async function handleSimilarWords() {
60
- setSimLoading(true); setError("");
61
- try {
62
- const res = await api.w2vSimilarWords({ word: simWord, top_k: simTopK });
63
- setSimResults(res.similar);
64
- } catch (err) {
65
- setError(getErrorMessage(err));
66
- } finally {
67
- setSimLoading(false);
68
- }
69
  }
70
 
71
- async function handleCompare() {
72
- setCompLoading(true); setError("");
73
- try {
74
- const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
75
- setCompResult(res);
76
- } catch (err) {
77
- setError(getErrorMessage(err));
78
- } finally {
79
- setCompLoading(false);
80
- }
81
- }
 
 
 
 
 
 
 
82
 
83
- async function handleQuery() {
84
- setQueryLoading(true); setError("");
85
- try {
86
- const res = await api.w2vQuery({ text: queryText, top_k: queryTopK });
87
- setQueryResults(res.results);
88
- } catch (err) {
89
- setError(getErrorMessage(err));
90
- } finally {
91
- setQueryLoading(false);
92
- }
93
  }
94
 
95
- const ready = initResult !== null;
96
-
97
  return (
98
  <div>
99
- {/* 1. Training */}
100
  <div className="panel">
101
- <h2>1. Train Word2Vec (gensim)</h2>
102
  <p className="panel-desc">
103
  Static embeddings — one vector per word, no context awareness.
104
- Useful as a baseline to compare against the transformer approach.
105
  </p>
106
- <div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
107
- <button className="btn btn-secondary" onClick={loadFromEngine}
108
- disabled={corpusLoading}>
109
- {corpusLoading ? "Loading..." : "Load from Engine"}
110
- </button>
111
- {corpusText && (
112
- <button className="btn btn-secondary" onClick={() => setCorpusText("")}>
113
- Clear
114
- </button>
115
- )}
116
- </div>
117
- <div className="form-group" style={{ marginBottom: 12 }}>
118
- <label>
119
- Corpus (separate documents with blank lines)
120
- {corpusText && (
121
- <span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
122
- {" "} — {parseCorpus().length} documents detected
123
- </span>
124
- )}
125
- </label>
126
- <textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
127
- placeholder="Document 1 text...\n\nDocument 2 text..." />
128
- </div>
129
  <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
130
  {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
131
  </button>
@@ -149,145 +106,15 @@ export default function Word2VecPanel() {
149
  </div>
150
  )}
151
 
152
- <button className="btn btn-primary" onClick={handleInit}
153
- disabled={initLoading || !corpusText.trim()} style={{ marginTop: 8 }}>
154
- {initLoading ? <><span className="spinner" /> Training...</> : "Train Word2Vec"}
155
  </button>
156
 
157
  <LogViewer active={initLoading} />
158
  </div>
159
 
160
  {error && <StatusMessage type="err" message={error} />}
161
-
162
- {initResult && (
163
- <div className="panel">
164
- <h2>Word2Vec Ready</h2>
165
- <div className="metric-grid">
166
- <MetricCard value={initResult.vocab_size} label="Vocabulary" />
167
- <MetricCard value={initResult.sentences} label="Sentences" />
168
- <MetricCard value={initResult.vector_size} label="Dimensions" />
169
- <MetricCard value={`${initResult.seconds}s`} label="Time" />
170
- </div>
171
- </div>
172
- )}
173
-
174
- {/* 2. Similar Words */}
175
- <div className="panel">
176
- <h2>2. Similar Words</h2>
177
- <p className="panel-desc">
178
- Find words that appear in similar contexts using Word2Vec static embeddings.
179
- </p>
180
- <div className="form-row">
181
- <div className="form-group">
182
- <label>Word</label>
183
- <input value={simWord} onChange={e => setSimWord(e.target.value)}
184
- onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
185
- placeholder="e.g. pizza" />
186
- </div>
187
- <div className="form-group form-group-sm">
188
- <label>Top K</label>
189
- <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
190
- </div>
191
- <div className="form-group form-group-sm">
192
- <label>&nbsp;</label>
193
- <button className="btn btn-primary" onClick={handleSimilarWords}
194
- disabled={simLoading || !ready || !simWord.trim()}>
195
- {simLoading ? "Searching..." : "Find"}
196
- </button>
197
- </div>
198
- </div>
199
-
200
- {simResults.length > 0 && (
201
- <table className="data-table" style={{ marginTop: 12 }}>
202
- <thead>
203
- <tr><th>Word</th><th>Similarity</th></tr>
204
- </thead>
205
- <tbody>
206
- {simResults.map((r, i) => (
207
- <tr key={i}>
208
- <td style={{ fontWeight: 600 }}>{r.word}</td>
209
- <td><ScoreBar score={r.score} /></td>
210
- </tr>
211
- ))}
212
- </tbody>
213
- </table>
214
- )}
215
- </div>
216
-
217
- {/* 3. Compare Texts */}
218
- <div className="panel">
219
- <h2>3. Compare Texts</h2>
220
- <p className="panel-desc">
221
- Sentence similarity via averaged word vectors.
222
- </p>
223
- <div className="form-row">
224
- <div className="form-group">
225
- <label>Text A</label>
226
- <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
227
- placeholder="pizza gives me homework" />
228
- </div>
229
- <div className="form-group">
230
- <label>Text B</label>
231
- <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
232
- placeholder="school gives me homework" />
233
- </div>
234
- </div>
235
- <button className="btn btn-primary" onClick={handleCompare}
236
- disabled={compLoading || !ready || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
237
- {compLoading ? "Comparing..." : "Compare"}
238
- </button>
239
-
240
- {compResult && (
241
- <div className="similarity-gauge" style={{ marginTop: 16 }}>
242
- <div className="similarity-value"
243
- style={{ color: scoreColor(compResult.similarity) }}>
244
- {compResult.similarity.toFixed(4)}
245
- </div>
246
- <div className="similarity-label">Word2Vec Cosine Similarity</div>
247
- </div>
248
- )}
249
- </div>
250
-
251
- {/* 4. Semantic Search */}
252
- <div className="panel">
253
- <h2>4. Semantic Search</h2>
254
- <p className="panel-desc">
255
- Search your corpus using averaged Word2Vec vectors.
256
- </p>
257
- <div className="form-row">
258
- <div className="form-group">
259
- <label>Query</label>
260
- <input value={queryText} onChange={e => setQueryText(e.target.value)}
261
- onKeyDown={e => e.key === "Enter" && handleQuery()}
262
- placeholder="a place where children learn" />
263
- </div>
264
- <div className="form-group form-group-sm">
265
- <label>Top K</label>
266
- <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
267
- </div>
268
- <div className="form-group form-group-sm">
269
- <label>&nbsp;</label>
270
- <button className="btn btn-primary" onClick={handleQuery}
271
- disabled={queryLoading || !ready || !queryText.trim()}>
272
- {queryLoading ? "Searching..." : "Search"}
273
- </button>
274
- </div>
275
- </div>
276
-
277
- {queryResults.length > 0 && (
278
- <div style={{ marginTop: 12 }}>
279
- {queryResults.map((r, i) => (
280
- <div key={i} className="result-card">
281
- <div className="result-header">
282
- <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
283
- <ScoreBar score={r.score} />
284
- </div>
285
- <div className="result-text">{r.text}</div>
286
- </div>
287
- ))}
288
- </div>
289
- )}
290
- </div>
291
  </div>
292
  );
293
  }
 
1
+ import { useState, useEffect } from "react";
2
  import { api, getErrorMessage } from "../api";
3
+ import type { W2VInitResponse } from "../types";
 
 
 
4
  import StatusMessage from "./StatusMessage";
5
  import LogViewer from "./LogViewer";
6
  import MetricCard from "./MetricCard";
7
 
8
+ interface Props {
9
+ onReady: (ready: boolean, info?: { vocab_size: number; sentences: number; vector_size: number }) => void;
10
+ }
11
+
12
+ export default function Word2VecPanel({ onReady }: Props) {
13
+ const [statusChecked, setStatusChecked] = useState(false);
14
+ const [trainResult, setTrainResult] = useState<W2VInitResponse | null>(null);
15
+
16
  const [vectorSize, setVectorSize] = useState(100);
17
  const [windowSize, setWindowSize] = useState(5);
18
  const [w2vEpochs, setW2vEpochs] = useState(50);
19
  const [showAdvanced, setShowAdvanced] = useState(false);
20
  const [initLoading, setInitLoading] = useState(false);
21
+ const [error, setError] = useState("");
22
+
23
+ useEffect(() => {
24
+ api.w2vStatus().then(res => {
25
+ if (res.ready) {
26
+ onReady(true, { vocab_size: res.vocab_size!, sentences: res.sentences!, vector_size: res.vector_size! });
27
+ }
28
+ setStatusChecked(true);
29
+ }).catch(() => setStatusChecked(true));
30
+ }, []);
31
+
32
+ async function handleTrainFromEngine() {
33
+ setInitLoading(true); setError(""); setTrainResult(null);
 
 
 
 
 
 
 
 
 
 
 
34
  try {
35
+ const res = await api.w2vInitFromEngine({
 
 
 
36
  vector_size: vectorSize,
37
  window: windowSize,
38
  epochs: w2vEpochs,
39
  });
40
+ setTrainResult(res);
41
  } catch (err) {
42
  setError(getErrorMessage(err));
43
  } finally {
 
45
  }
46
  }
47
 
48
+ if (!statusChecked) {
49
+ return <div className="panel"><p>Checking Word2Vec status...</p></div>;
 
 
 
 
 
 
 
 
50
  }
51
 
52
+ // Training complete — show results + continue button
53
+ if (trainResult) {
54
+ return (
55
+ <div>
56
+ <div className="panel">
57
+ <h2>Training Complete</h2>
58
+ <div className="metric-grid">
59
+ <MetricCard value={trainResult.vocab_size} label="Vocabulary" />
60
+ <MetricCard value={trainResult.sentences} label="Sentences" />
61
+ <MetricCard value={trainResult.vector_size} label="Dimensions" />
62
+ <MetricCard value={`${trainResult.seconds}s`} label="Train Time" />
63
+ </div>
64
+ <StatusMessage type="ok" message="Word2Vec model trained and saved. It will persist across restarts." />
65
+ <button className="btn btn-primary" style={{ marginTop: 12 }}
66
+ onClick={() => onReady(true, { vocab_size: trainResult.vocab_size, sentences: trainResult.sentences, vector_size: trainResult.vector_size })}>
67
+ Continue to Analysis
68
+ </button>
69
+ </div>
70
 
71
+ <LogViewer active={false} />
72
+ </div>
73
+ );
 
 
 
 
 
 
 
74
  }
75
 
76
+ // Training form
 
77
  return (
78
  <div>
 
79
  <div className="panel">
80
+ <h2>Word2Vec Baseline (gensim)</h2>
81
  <p className="panel-desc">
82
  Static embeddings — one vector per word, no context awareness.
83
+ Train on all documents loaded in the engine to use as a baseline comparison.
84
  </p>
85
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
87
  {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
88
  </button>
 
106
  </div>
107
  )}
108
 
109
+ <button className="btn btn-primary" onClick={handleTrainFromEngine}
110
+ disabled={initLoading} style={{ marginTop: 8 }}>
111
+ {initLoading ? <><span className="spinner" /> Training on all engine documents...</> : "Train Word2Vec"}
112
  </button>
113
 
114
  <LogViewer active={initLoading} />
115
  </div>
116
 
117
  {error && <StatusMessage type="err" message={error} />}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  </div>
119
  );
120
  }
frontend/src/components/Word2VecTools.tsx ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
4
+ import { scoreColor } from "../utils/colors";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+ import DocumentViewer from "./DocumentViewer";
8
+
9
/**
 * Coerce a Top-K field value to a whole number of at least 1.
 * A cleared number input parses to 0 (or NaN), which is not a usable result count.
 */
function atLeastOne(value: number): number {
  const whole = Math.trunc(value);
  return Number.isFinite(whole) && whole >= 1 ? whole : 1;
}

/**
 * Word2Vec analysis tools: similar-word lookup, pairwise text comparison and
 * corpus search, each backed by the corresponding `api.w2v*` call.
 *
 * All three tools share one error banner; starting any request clears it.
 */
export default function Word2VecTools() {
  const [error, setError] = useState("");

  // Similar words
  const [simWord, setSimWord] = useState("");
  const [simTopK, setSimTopK] = useState(10);
  const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
  const [simLoading, setSimLoading] = useState(false);

  // Compare
  const [compTextA, setCompTextA] = useState("");
  const [compTextB, setCompTextB] = useState("");
  const [compResult, setCompResult] = useState<CompareResponse | null>(null);
  const [compLoading, setCompLoading] = useState(false);

  // Search
  const [queryText, setQueryText] = useState("");
  const [queryTopK, setQueryTopK] = useState(5);
  const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
  const [queryLoading, setQueryLoading] = useState(false);

  /** Fetch the nearest neighbours of `simWord`. */
  async function handleSimilarWords() {
    // The Enter-key shortcut bypasses the button's `disabled` state, so re-check here.
    if (simLoading || !simWord.trim()) return;
    setSimLoading(true);
    setError("");
    try {
      const res = await api.w2vSimilarWords({ word: simWord, top_k: atLeastOne(simTopK) });
      setSimResults(res.similar);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setSimLoading(false);
    }
  }

  /** Compare Text A against Text B. */
  async function handleCompare() {
    // Same conditions that disable the Compare button.
    if (compLoading || !compTextA.trim() || !compTextB.trim()) return;
    setCompLoading(true);
    setError("");
    try {
      const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
      setCompResult(res);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setCompLoading(false);
    }
  }

  /** Search the corpus for `queryText`. */
  async function handleQuery() {
    // The Enter-key shortcut bypasses the button's `disabled` state, so re-check here.
    if (queryLoading || !queryText.trim()) return;
    setQueryLoading(true);
    setError("");
    try {
      const res = await api.w2vQuery({ text: queryText, top_k: atLeastOne(queryTopK) });
      setQueryResults(res.results);
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setQueryLoading(false);
    }
  }

  return (
    <div>
      {error && <StatusMessage type="err" message={error} />}

      <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: 16 }}>
        {/* Similar Words */}
        <div className="panel">
          <h3 style={{ marginTop: 0 }}>Similar Words</h3>
          <p className="panel-desc">
            Find words that appear in similar contexts using Word2Vec static embeddings.
          </p>
          <div className="form-row">
            <div className="form-group">
              <label>Word</label>
              <input value={simWord} onChange={e => setSimWord(e.target.value)}
                onKeyDown={e => { if (e.key === "Enter") void handleSimilarWords(); }}
                placeholder="e.g. pizza" />
            </div>
            <div className="form-group form-group-sm">
              <label>Top K</label>
              <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)}
                min={1} max={50} style={{ width: 60 }} />
            </div>
            <div className="form-group form-group-sm">
              <label>&nbsp;</label>
              <button className="btn btn-primary" onClick={handleSimilarWords}
                disabled={simLoading || !simWord.trim()}>
                {simLoading ? "..." : "Find"}
              </button>
            </div>
          </div>

          {simResults.length > 0 && (
            <table className="data-table" style={{ marginTop: 8 }}>
              <thead>
                <tr><th>Word</th><th>Similarity</th></tr>
              </thead>
              <tbody>
                {simResults.map(r => (
                  <tr key={r.word}>
                    <td style={{ fontWeight: 600 }}>{r.word}</td>
                    <td><ScoreBar score={r.score} /></td>
                  </tr>
                ))}
              </tbody>
            </table>
          )}
        </div>

        {/* Compare Texts */}
        <div className="panel">
          <h3 style={{ marginTop: 0 }}>Compare Texts</h3>
          <p className="panel-desc">
            Sentence similarity via averaged word vectors.
          </p>
          <div className="form-group" style={{ marginBottom: 8 }}>
            <label>Text A</label>
            <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
              placeholder="pizza gives me homework" />
          </div>
          <div className="form-group" style={{ marginBottom: 8 }}>
            <label>Text B</label>
            <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
              placeholder="school gives me homework" />
          </div>
          <button className="btn btn-primary" onClick={handleCompare}
            disabled={compLoading || !compTextA.trim() || !compTextB.trim()}>
            {compLoading ? "..." : "Compare"}
          </button>

          {compResult && (
            <div className="similarity-gauge" style={{ marginTop: 12 }}>
              <div className="similarity-value"
                style={{ color: scoreColor(compResult.similarity) }}>
                {compResult.similarity.toFixed(4)}
              </div>
              <div className="similarity-label">Word2Vec Cosine Similarity</div>
            </div>
          )}
        </div>
      </div>

      {/* Semantic Search — full width */}
      <div className="panel">
        <h3 style={{ marginTop: 0 }}>Semantic Search</h3>
        <p className="panel-desc">
          Search your corpus using averaged Word2Vec vectors.
        </p>
        <div className="form-row">
          <div className="form-group" style={{ flex: 1 }}>
            <label>Query</label>
            <input value={queryText} onChange={e => setQueryText(e.target.value)}
              onKeyDown={e => { if (e.key === "Enter") void handleQuery(); }}
              placeholder="a place where children learn" />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)}
              min={1} max={20} style={{ width: 60 }} />
          </div>
          <div className="form-group form-group-sm">
            <label>&nbsp;</label>
            <button className="btn btn-primary" onClick={handleQuery}
              disabled={queryLoading || !queryText.trim()}>
              {queryLoading ? "Searching..." : "Search"}
            </button>
          </div>
        </div>

        {queryResults.length > 0 && (
          <div style={{ marginTop: 8 }}>
            {queryResults.map(r => (
              <DocumentViewer key={`${r.doc_id}-${r.rank}`} docId={r.doc_id}>
                <div className="result-card" style={{ cursor: "pointer" }}>
                  <div className="result-header">
                    <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
                    <ScoreBar score={r.score} />
                  </div>
                  <div className="result-text">{r.text}</div>
                </div>
              </DocumentViewer>
            ))}
          </div>
        )}
      </div>
    </div>
  );
}
frontend/tsconfig.tsbuildinfo ADDED
@@ -0,0 +1 @@
 
 
1
+ {"root":["./src/app.tsx","./src/api.ts","./src/main.tsx","./src/types.ts","./src/vite-env.d.ts","./src/components/batchanalysis.tsx","./src/components/contextanalysis.tsx","./src/components/datasetpanel.tsx","./src/components/documentviewer.tsx","./src/components/enginesetup.tsx","./src/components/evaluationdashboard.tsx","./src/components/keywordanalysis.tsx","./src/components/keywordmatcher.tsx","./src/components/logviewer.tsx","./src/components/metriccard.tsx","./src/components/scorebar.tsx","./src/components/select.tsx","./src/components/semanticsearch.tsx","./src/components/similarwords.tsx","./src/components/statusmessage.tsx","./src/components/switch.tsx","./src/components/textcompare.tsx","./src/components/toggle.tsx","./src/components/trainingpanel.tsx","./src/components/word2vecpanel.tsx","./src/components/word2vectools.tsx","./src/hooks/useapicall.ts","./src/hooks/usecorpusloader.ts","./src/utils/colors.ts"],"version":"5.9.3"}
server.py CHANGED
@@ -145,12 +145,13 @@ evaluator: Optional[Evaluator] = None
145
  w2v_engine: Optional[Word2VecEngine] = None
146
 
147
  ENGINE_SAVE_DIR = Path(os.environ.get("ENGINE_STATE_DIR", str(BASE_DIR / "engine_state")))
 
148
 
149
 
150
  @app.on_event("startup")
151
  def _auto_restore():
152
- """Restore engine state from disk if a previous save exists."""
153
- global engine, evaluator
154
  if (ENGINE_SAVE_DIR / "meta.json").is_file():
155
  try:
156
  engine = ContextualSimilarityEngine.load(str(ENGINE_SAVE_DIR))
@@ -160,6 +161,13 @@ def _auto_restore():
160
  len(engine.chunks), len(engine._doc_ids))
161
  except Exception:
162
  logger.exception("Failed to auto-restore engine state — starting fresh")
 
 
 
 
 
 
 
163
 
164
 
165
  @app.get("/api/logs/stream")
@@ -572,6 +580,18 @@ def get_corpus_texts(max_docs: int = Query(default=500, ge=1, le=10_000)):
572
  return {"documents": result, "count": len(result)}
573
 
574
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  @app.post("/api/engine/save")
576
  def save_engine():
577
  """Save current engine state to disk for later restore."""
@@ -615,9 +635,53 @@ def w2v_init(req: W2VInitRequest):
615
  stats = w2v_engine.build_index()
616
  elapsed = round(time.time() - t0, 2)
617
  logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
 
 
 
 
 
618
  return {**stats, "seconds": elapsed}
619
 
620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  @app.post("/api/w2v/compare")
622
  def w2v_compare(req: W2VCompareRequest):
623
  _ensure_w2v()
@@ -642,6 +706,32 @@ def w2v_similar_words(req: W2VWordRequest):
642
  return {"word": req.word, "similar": [{"word": w, "score": round(s, 4)} for w, s in similar]}
643
 
644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  # ------------------------------------------------------------------ #
646
  # Dataset endpoints (HuggingFace Epstein Files)
647
  # ------------------------------------------------------------------ #
 
145
  w2v_engine: Optional[Word2VecEngine] = None
146
 
147
  ENGINE_SAVE_DIR = Path(os.environ.get("ENGINE_STATE_DIR", str(BASE_DIR / "engine_state")))
148
+ W2V_SAVE_DIR = Path(os.environ.get("W2V_STATE_DIR", str(BASE_DIR / "w2v_state")))
149
 
150
 
151
  @app.on_event("startup")
152
  def _auto_restore():
153
+ """Restore engine and W2V state from disk if previous saves exist."""
154
+ global engine, evaluator, w2v_engine
155
  if (ENGINE_SAVE_DIR / "meta.json").is_file():
156
  try:
157
  engine = ContextualSimilarityEngine.load(str(ENGINE_SAVE_DIR))
 
161
  len(engine.chunks), len(engine._doc_ids))
162
  except Exception:
163
  logger.exception("Failed to auto-restore engine state — starting fresh")
164
+ if Word2VecEngine.has_saved_state(str(W2V_SAVE_DIR)):
165
+ try:
166
+ w2v_engine = Word2VecEngine.load(str(W2V_SAVE_DIR))
167
+ logger.info("Auto-restored Word2Vec: %d sentences, %d vocab",
168
+ len(w2v_engine.sentences), len(w2v_engine.model.wv))
169
+ except Exception:
170
+ logger.exception("Failed to auto-restore Word2Vec state — starting fresh")
171
 
172
 
173
  @app.get("/api/logs/stream")
 
580
  return {"documents": result, "count": len(result)}
581
 
582
 
583
@app.get("/api/documents/{doc_id}")
def get_document(doc_id: str):
    """Reassemble one document's full text from its indexed chunks.

    Chunks whose ``doc_id`` matches are ordered by ``chunk_index`` and joined
    with newlines. Responds with HTTP 404 when no chunk belongs to ``doc_id``.
    """
    _ensure_engine()
    matching = sorted(
        (chunk for chunk in engine.chunks if chunk.doc_id == doc_id),
        key=lambda chunk: chunk.chunk_index,
    )
    if not matching:
        raise HTTPException(404, f"Document '{doc_id}' not found.")
    return {
        "doc_id": doc_id,
        "text": "\n".join(chunk.text for chunk in matching),
        "num_chunks": len(matching),
    }
593
+
594
+
595
  @app.post("/api/engine/save")
596
  def save_engine():
597
  """Save current engine state to disk for later restore."""
 
635
  stats = w2v_engine.build_index()
636
  elapsed = round(time.time() - t0, 2)
637
  logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
638
+ # Auto-save so data persists across restarts
639
+ try:
640
+ w2v_engine.save(str(W2V_SAVE_DIR))
641
+ except Exception:
642
+ logger.warning("Auto-save W2V after init failed", exc_info=True)
643
  return {**stats, "seconds": elapsed}
644
 
645
 
646
@app.post("/api/w2v/init-from-engine")
def w2v_init_from_engine(
    vector_size: int = Query(default=100, ge=50, le=500),
    window: int = Query(default=5, ge=1, le=20),
    epochs: int = Query(default=50, ge=1, le=200),
):
    """Train Word2Vec directly from all documents already loaded in the engine.

    This avoids the round-trip through the frontend and uses ALL engine docs.

    Each document is rebuilt by joining its chunks, ordered by ``chunk_index``,
    with newlines (the same reconstruction ``get_document`` performs). The new
    model replaces the global ``w2v_engine`` only after training succeeds, so a
    failed run leaves any previously loaded model in place.

    Responds with HTTP 400 when the engine holds no chunks. Returns the
    ``build_index`` stats plus ``seconds`` and ``documents_used``.
    """
    global w2v_engine
    _ensure_engine()
    if not engine.chunks:
        raise HTTPException(400, "No documents in the engine. Load a dataset first.")

    # Group chunks by doc_id to reconstruct full documents
    chunks_by_doc: dict[str, list] = {}
    for chunk in engine.chunks:
        chunks_by_doc.setdefault(chunk.doc_id, []).append(chunk)

    logger.info("Word2Vec init from engine: %d documents, vector_size=%d, window=%d, epochs=%d",
                len(chunks_by_doc), vector_size, window, epochs)
    t0 = time.time()
    # Train into a local first: publishing before build_index() finishes would
    # replace a working model with an untrained one if training raises.
    candidate = Word2VecEngine(vector_size=vector_size, window=window, epochs=epochs)
    for doc_id, doc_chunks in chunks_by_doc.items():
        ordered = sorted(doc_chunks, key=lambda c: c.chunk_index)
        candidate.add_document(doc_id, "\n".join(c.text for c in ordered))
    stats = candidate.build_index()
    w2v_engine = candidate
    elapsed = round(time.time() - t0, 2)
    logger.info("Word2Vec ready: %s in %.2fs", stats, elapsed)
    # Auto-save is best-effort: the in-memory model is usable even if this fails.
    try:
        w2v_engine.save(str(W2V_SAVE_DIR))
    except Exception:
        logger.warning("Auto-save W2V after init failed", exc_info=True)
    return {**stats, "seconds": elapsed, "documents_used": len(chunks_by_doc)}
683
+
684
+
685
  @app.post("/api/w2v/compare")
686
  def w2v_compare(req: W2VCompareRequest):
687
  _ensure_w2v()
 
706
  return {"word": req.word, "similar": [{"word": w, "score": round(s, 4)} for w, s in similar]}
707
 
708
 
709
@app.get("/api/w2v/status")
def w2v_status():
    """Report whether a Word2Vec model is currently held in memory.

    When a model is loaded, returns ``ready: True`` with its vocabulary size,
    sentence count and vector size. Otherwise returns ``ready: False`` plus
    ``has_saved_state``, which tells whether a saved state exists on disk.
    """
    model_loaded = w2v_engine is not None and w2v_engine.model is not None
    if not model_loaded:
        return {
            "ready": False,
            "has_saved_state": Word2VecEngine.has_saved_state(str(W2V_SAVE_DIR)),
        }
    return {
        "ready": True,
        "vocab_size": len(w2v_engine.model.wv),
        "sentences": len(w2v_engine.sentences),
        "vector_size": w2v_engine.vector_size,
    }
721
+
722
+
723
@app.post("/api/w2v/reset")
def w2v_reset():
    """Drop the in-memory Word2Vec model and remove its saved state directory.

    The save directory is deleted recursively if it exists. Always returns an
    ``ok`` status payload.
    """
    global w2v_engine
    import shutil

    w2v_engine = None
    saved_state_present = W2V_SAVE_DIR.is_dir()
    if saved_state_present:
        shutil.rmtree(str(W2V_SAVE_DIR))
        logger.info("Word2Vec state deleted from %s", W2V_SAVE_DIR)
    return {"status": "ok", "message": "Word2Vec state cleared. You can retrain now."}
733
+
734
+
735
  # ------------------------------------------------------------------ #
736
  # Dataset endpoints (HuggingFace Epstein Files)
737
  # ------------------------------------------------------------------ #
word2vec_baseline.py CHANGED
@@ -16,9 +16,11 @@ Usage:
16
  score = w2v.compare_texts("pizza gives me homework", "school gives me homework")
17
  """
18
 
 
19
  import re
20
  import logging
21
  from dataclasses import dataclass
 
22
  from typing import Optional
23
 
24
  import numpy as np
@@ -145,6 +147,76 @@ class Word2VecEngine:
145
  return 0.0
146
  return float(self.model.wv.similarity(a, b))
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  # ------------------------------------------------------------------ #
149
 
150
  def _sentence_vector(self, tokens: list[str]) -> np.ndarray:
 
16
  score = w2v.compare_texts("pizza gives me homework", "school gives me homework")
17
  """
18
 
19
+ import json
20
  import re
21
  import logging
22
  from dataclasses import dataclass
23
+ from pathlib import Path
24
  from typing import Optional
25
 
26
  import numpy as np
 
147
  return 0.0
148
  return float(self.model.wv.similarity(a, b))
149
 
150
+ # ------------------------------------------------------------------ #
151
+ # Persistence
152
+ # ------------------------------------------------------------------ #
153
+
154
def save(self, directory: str) -> dict:
    """Save trained Word2Vec state to disk for later restore.

    Writes ``w2v.model``, ``sentence_vecs.npy``, ``w2v_sentences.json`` and
    finally ``w2v_meta.json`` into ``directory`` (created if missing).

    ``w2v_meta.json`` is the marker that ``has_saved_state`` and ``load`` look
    for. Any stale marker is removed first and the new one is written last, so
    an interrupted save is never reported as a restorable state.

    Args:
        directory: Target directory for the saved state.

    Returns:
        The metadata dict that was written to ``w2v_meta.json``.

    Raises:
        RuntimeError: If the model has not been trained yet. Nothing is
            created on disk in that case.
    """
    # Validate before touching the filesystem so a failed call leaves no
    # empty directory behind.
    if self.model is None:
        raise RuntimeError("Cannot save: model has not been trained yet.")

    save_dir = Path(directory)
    save_dir.mkdir(parents=True, exist_ok=True)

    meta_path = save_dir / "w2v_meta.json"
    # Invalidate any previous save while its files are being overwritten.
    meta_path.unlink(missing_ok=True)

    self.model.save(str(save_dir / "w2v.model"))
    np.save(save_dir / "sentence_vecs.npy", self.sentence_vecs)

    # Save sentences and their doc mappings
    with open(save_dir / "w2v_sentences.json", "w", encoding="utf-8") as f:
        json.dump({"sentences": self.sentences, "sentence_docs": self.sentence_docs}, f)

    meta = {
        "vector_size": self.vector_size,
        "window": self.window,
        "min_count": self.min_count,
        "epochs": self.epochs,
        "sg": self.sg,
        "num_sentences": len(self.sentences),
        "vocab_size": len(self.model.wv),
    }
    # Written last: its presence signals that every other file is complete.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    logger.info("Word2Vec saved to %s: %d sentences, %d vocab",
                directory, len(self.sentences), len(self.model.wv))
    return meta
184
+
185
@classmethod
def load(cls, directory: str) -> "Word2VecEngine":
    """Rebuild an engine from the files a previous ``save`` wrote to ``directory``.

    Hyperparameters come from ``w2v_meta.json``; the trained model, the
    sentence vectors and the sentence/doc mappings are read from their own
    files in the same directory.

    Raises:
        FileNotFoundError: If ``w2v_meta.json`` is not present in ``directory``.
    """
    root = Path(directory)
    meta_path = root / "w2v_meta.json"
    if not meta_path.is_file():
        raise FileNotFoundError(f"No saved Word2Vec state at {directory}")

    with open(meta_path) as meta_file:
        meta = json.load(meta_file)

    hyperparams = ("vector_size", "window", "min_count", "epochs", "sg")
    engine = cls(**{name: meta[name] for name in hyperparams})

    engine.model = Word2Vec.load(str(root / "w2v.model"))
    engine.sentence_vecs = np.load(root / "sentence_vecs.npy")

    with open(root / "w2v_sentences.json") as sentences_file:
        stored = json.load(sentences_file)
    engine.sentences = stored["sentences"]
    engine.sentence_docs = stored["sentence_docs"]

    logger.info("Word2Vec loaded from %s: %d sentences, %d vocab",
                directory, len(engine.sentences), len(engine.model.wv))
    return engine
214
+
215
@staticmethod
def has_saved_state(directory: str) -> bool:
    """Return True when ``directory`` contains a ``w2v_meta.json`` file."""
    marker = Path(directory) / "w2v_meta.json"
    return marker.is_file()
219
+
220
  # ------------------------------------------------------------------ #
221
 
222
  def _sentence_vector(self, tokens: list[str]) -> np.ndarray: