Besjon Cifliku committed on
Commit
db764ae
·
1 Parent(s): 9f009c2

feat: initial project setup

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +42 -0
  2. .gitignore +55 -0
  3. Dockerfile +59 -0
  4. HOWTO.md +390 -0
  5. README.md +193 -2
  6. contextual_similarity.py +850 -0
  7. data_loader.py +286 -0
  8. demo.py +233 -0
  9. docker-compose.yml +18 -0
  10. evaluation.py +547 -0
  11. frontend/.gitignore +24 -0
  12. frontend/README.md +16 -0
  13. frontend/eslint.config.js +29 -0
  14. frontend/index.html +12 -0
  15. frontend/package-lock.json +0 -0
  16. frontend/package.json +30 -0
  17. frontend/public/vite.svg +1 -0
  18. frontend/src/App.tsx +182 -0
  19. frontend/src/api.ts +144 -0
  20. frontend/src/assets/react.svg +1 -0
  21. frontend/src/components/BatchAnalysis.tsx +110 -0
  22. frontend/src/components/ContextAnalysis.tsx +116 -0
  23. frontend/src/components/DatasetPanel.tsx +246 -0
  24. frontend/src/components/EngineSetup.tsx +172 -0
  25. frontend/src/components/EvaluationDashboard.tsx +603 -0
  26. frontend/src/components/KeywordAnalysis.tsx +100 -0
  27. frontend/src/components/KeywordMatcher.tsx +90 -0
  28. frontend/src/components/LogViewer.tsx +71 -0
  29. frontend/src/components/MetricCard.tsx +16 -0
  30. frontend/src/components/ScoreBar.tsx +19 -0
  31. frontend/src/components/Select.tsx +60 -0
  32. frontend/src/components/SemanticSearch.tsx +70 -0
  33. frontend/src/components/SimilarWords.tsx +75 -0
  34. frontend/src/components/StatusMessage.tsx +13 -0
  35. frontend/src/components/Switch.tsx +22 -0
  36. frontend/src/components/TextCompare.tsx +84 -0
  37. frontend/src/components/Toggle.tsx +27 -0
  38. frontend/src/components/TrainingPanel.tsx +349 -0
  39. frontend/src/components/Word2VecPanel.tsx +293 -0
  40. frontend/src/hooks/useApiCall.ts +34 -0
  41. frontend/src/hooks/useCorpusLoader.ts +48 -0
  42. frontend/src/main.tsx +9 -0
  43. frontend/src/styles.css +828 -0
  44. frontend/src/types.ts +302 -0
  45. frontend/src/utils/colors.ts +6 -0
  46. frontend/src/vite-env.d.ts +1 -0
  47. frontend/tsconfig.json +21 -0
  48. frontend/vite.config.ts +15 -0
  49. pyproject.toml +27 -0
  50. requirements.txt +12 -0
.dockerignore ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated data & model artifacts
2
+ engine_state/
3
+ chroma_epstein/
4
+ checkpoints/
5
+ trained_model/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ .venv/
11
+ venv/
12
+ *.egg-info/
13
+
14
+ # Node (frontend is built inside Docker)
15
+ frontend/node_modules/
16
+ frontend/dist/
17
+
18
+ # Git
19
+ .git/
20
+ .gitattributes
21
+
22
+ # OS & IDE
23
+ .DS_Store
24
+ .vscode/
25
+ .idea/
26
+
27
+ # HuggingFace cache
28
+ .cache/
29
+
30
+ # Docs (not needed in image)
31
+ HOWTO.md
32
+ README.md
33
+
34
+ # Docker (avoid recursive COPY)
35
+ Dockerfile
36
+ docker-compose.yml
37
+ .dockerignore
38
+
39
+ # Env & logs
40
+ .env
41
+ .env.local
42
+ *.log
.gitignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .venv/
11
+ venv/
12
+ .Python
13
+
14
+ # Node / Frontend
15
+ frontend/node_modules/
16
+ frontend/dist/
17
+ frontend/dist-ssr/
18
+ npm-debug.log*
19
+ yarn-debug.log*
20
+ pnpm-debug.log*
21
+
22
+ # Generated data & model artifacts
23
+ engine_state/
24
+ chroma_epstein/
25
+ checkpoints/
26
+ trained_model/
27
+ *.faiss
28
+ *.npy
29
+ *.pkl
30
+ *.pickle
31
+
32
+ # HuggingFace cache
33
+ .cache/
34
+
35
+ # OS
36
+ .DS_Store
37
+ Thumbs.db
38
+
39
+ # IDEs
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ *.suo
45
+ *.ntvs*
46
+ *.njsproj
47
+ *.sln
48
+
49
+ # Environment
50
+ .env
51
+ .env.local
52
+ .env.*.local
53
+
54
+ # Logs
55
+ *.log
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================
2
+ # Multi-stage Docker build for Contextual Similarity Engine
3
+ # Single container: React frontend + FastAPI backend
4
+ # Deploys to: HuggingFace Spaces (Docker SDK), local, Railway
5
+ # =============================================================
6
+
7
+ # Stage 1: Build frontend
8
+ FROM node:22-slim AS frontend-build
9
+ WORKDIR /app/frontend
10
+ COPY frontend/package.json frontend/package-lock.json ./
11
+ RUN npm ci
12
+ COPY frontend/ ./
13
+ RUN npm run build
14
+
15
+ # Stage 2: Python runtime
16
+ FROM python:3.12-slim AS runtime
17
+
18
+ # Create non-root user (required by HF Spaces)
19
+ RUN useradd -m -u 1000 appuser
20
+ WORKDIR /app
21
+
22
+ # System deps for faiss-cpu and torch
23
+ RUN apt-get update && apt-get install -y --no-install-recommends \
24
+ build-essential \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Install uv for fast dependency resolution
28
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
29
+
30
+ # Copy dependency files first (cache layer)
31
+ COPY --chown=appuser pyproject.toml uv.lock ./
32
+
33
+ # Install Python dependencies
34
+ RUN uv sync --frozen --no-dev
35
+
36
+ # Copy backend source
37
+ COPY --chown=appuser *.py ./
38
+
39
+ # Copy pre-built frontend
40
+ COPY --chown=appuser --from=frontend-build /app/frontend/dist ./frontend/dist
41
+
42
+ # Data directories (HF cache, engine state, trained models)
43
+ RUN mkdir -p /data/huggingface /data/engine_state /data/trained_model \
44
+ && chown -R appuser:appuser /app /data
45
+
46
+ ENV HF_HOME=/data/huggingface
47
+ ENV TRANSFORMERS_CACHE=/data/huggingface
48
+ ENV ENGINE_STATE_DIR=/data/engine_state
49
+
50
+ # Switch to non-root user
51
+ USER appuser
52
+
53
+ # Expose port (HF Spaces expects 7860, override via PORT env)
54
+ EXPOSE 7860
55
+
56
+ # Run the server — HOST and PORT configurable via env
57
+ ENV HOST=0.0.0.0
58
+ ENV PORT=7860
59
+ CMD ["uv", "run", "python", "server.py"]
HOWTO.md ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contextual Similarity Engine — HOWTO
2
+
3
+ ## Overview
4
+
5
+ This project uses **transformer-based sentence embeddings** to find and compare
6
+ contextual meanings of keywords within large documents. Unlike Word2Vec (static,
7
+ one-vector-per-word), this system **fine-tunes on YOUR corpus** so it learns
8
+ domain-specific patterns — e.g. that "pizza" means "school" in your data.
9
+
10
+ A **Word2Vec (gensim) baseline** is included for comparison, demonstrating why
11
+ contextual embeddings are superior for meaning disambiguation.
12
+
13
+ **The pipeline is: TRAIN → INDEX → ANALYZE → EVALUATE.**
14
+
15
+ **Stack:**
16
+ - **SentenceTransformers** — contextual embeddings (PyTorch)
17
+ - **FAISS** — fast vector similarity search
18
+ - **gensim Word2Vec** — static embedding baseline for comparison
19
+ - **FastAPI** — REST API backend
20
+ - **React + TypeScript** — visualization frontend
21
+ - **scikit-learn** — clustering & evaluation metrics
22
+
23
+ ---
24
+
25
+ ## 1. Install Dependencies
26
+
27
+ ### Python backend (uv — recommended)
28
+
29
+ [uv](https://docs.astral.sh/uv/) is a fast Python package manager that replaces
30
+ `pip`, `venv`, and `requirements.txt` with a single tool and lockfile.
31
+
32
+ ```bash
33
+ # Install uv (if not already installed)
34
+ curl -LsSf https://astral.sh/uv/install.sh | sh
35
+
36
+ # Create a virtual environment and install all dependencies from pyproject.toml
37
+ cd esfiles
38
+ uv sync
39
+
40
+ # Run commands inside the managed environment
41
+ uv run python server.py
42
+ uv run python demo.py
43
+ ```
44
+
45
+ `uv sync` reads `pyproject.toml`, resolves dependencies, creates a `.venv`,
46
+ and generates a `uv.lock` lockfile for reproducible installs. The lockfile
47
+ pins exact versions so every machine gets identical dependencies.
48
+
49
+ **Adding/removing packages:**
50
+
51
+ ```bash
52
+ uv add httpx # add a new dependency
53
+ uv remove httpx # remove it
54
+ uv lock --upgrade # upgrade all packages to latest compatible versions
55
+ ```
56
+
57
+ ### Python backend (pip — alternative)
58
+
59
+ ```bash
60
+ python3 -m venv venv
61
+ source venv/bin/activate
62
+ pip install -r requirements.txt
63
+ ```
64
+
65
+ ### React frontend
66
+
67
+ ```bash
68
+ cd frontend
69
+ npm install
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 2. Quick Start
75
+
76
+ ### CLI demo (Word2Vec vs Transformer comparison)
77
+
78
+ ```bash
79
+ uv run python demo.py
80
+ ```
81
+
82
+ This runs side-by-side comparison:
83
+ 1. Builds both Transformer and Word2Vec engines on the same corpus
84
+ 2. Compares text similarity scores between approaches
85
+ 3. Shows word-level similarity (Word2Vec only — transformers don't do single words)
86
+ 4. Runs semantic search with both engines
87
+ 5. Tests keyword meaning matching ("pizza" → food or school?)
88
+ 6. Demonstrates clustering (transformer can separate meanings, Word2Vec cannot)
89
+
90
+ ### Web UI
91
+
92
+ ```bash
93
+ # Terminal 1: start the API server
94
+ uv run python server.py
95
+
96
+ # Terminal 2: start the React dev server
97
+ cd frontend && npm run dev
98
+ ```
99
+
100
+ - API docs: `http://localhost:8000/docs`
101
+ - Frontend: `http://localhost:5173`
102
+
103
+ ---
104
+
105
+ ## 3. Training Your Model
106
+
107
+ Three strategies, from simplest to most powerful:
108
+
109
+ ### Strategy 1: Unsupervised (TSDAE)
110
+
111
+ No labels needed. Learns your corpus vocabulary and phrasing via denoising autoencoder.
112
+
113
+ ```python
114
+ from training import CorpusTrainer
115
+
116
+ corpus_texts = [open(f).read() for f in your_files]
117
+ trainer = CorpusTrainer(corpus_texts, base_model="all-MiniLM-L6-v2")
118
+
119
+ result = trainer.train_unsupervised(
120
+ output_path="./trained_model",
121
+ epochs=3,
122
+ batch_size=16,
123
+ )
124
+ print(f"Trained on {result['training_pairs']} sentences in {result['seconds']}s")
125
+ ```
126
+
127
+ ### Strategy 2: Contrastive (auto-mined pairs)
128
+
129
+ Adjacent sentences = similar, random sentences = dissimilar. Learns document structure
130
+ using MultipleNegativesRankingLoss with in-batch negatives.
131
+
132
+ ```python
133
+ trainer = CorpusTrainer(corpus_texts)
134
+
135
+ result = trainer.train_contrastive(
136
+ output_path="./trained_model",
137
+ epochs=5,
138
+ batch_size=16,
139
+ )
140
+ ```
141
+
142
+ ### Strategy 3: Keyword-supervised (best if you know the code words)
143
+
144
+ You provide a keyword→meaning map. The trainer auto-generates training pairs:
145
+ keyword-in-context ↔ meaning-substituted version, plus contrastive pairs from
146
+ corpus structure.
147
+
148
+ ```python
149
+ trainer = CorpusTrainer(corpus_texts)
150
+
151
+ result = trainer.train_with_keywords(
152
+ keyword_meanings={"pizza": "school", "pepperoni": "math class"},
153
+ output_path="./trained_model",
154
+ epochs=5,
155
+ batch_size=16,
156
+ )
157
+ print(f"Keywords: {result['keywords']}")
158
+ ```
159
+
160
+ ### Verifying training worked
161
+
162
+ ```python
163
+ # Compare base model vs trained model on test pairs
164
+ comparison = trainer.evaluate_model(
165
+ test_pairs=[
166
+ ("pizza gives me homework", "school gives me homework", 0.95),
167
+ ("pizza gives me homework", "I ate delicious pizza", 0.1),
168
+ ("The pizza test is hard", "The school exam is difficult", 0.9),
169
+ ],
170
+ trained_model_path="./trained_model",
171
+ )
172
+
173
+ print(f"Base error: {comparison['summary']['avg_base_error']:.4f}")
174
+ print(f"Trained error: {comparison['summary']['avg_trained_error']:.4f}")
175
+ print(f"Reduction: {comparison['summary']['error_reduction_pct']:.1f}%")
176
+ print(f"Improved: {comparison['summary']['improved']}/{comparison['summary']['total']}")
177
+ ```
178
+
179
+ ---
180
+
181
+ ## 4. Using Your Trained Model
182
+
183
+ After training, use the saved model path instead of the pretrained model name:
184
+
185
+ ```python
186
+ from contextual_similarity import ContextualSimilarityEngine
187
+
188
+ engine = ContextualSimilarityEngine(model_name="./trained_model")
189
+
190
+ engine.add_document("doc1", open("doc1.txt").read())
191
+ engine.build_index()
192
+
193
+ # Queries now use your domain-trained embeddings
194
+ results = engine.query("pizza homework", top_k=10)
195
+ matches = engine.match_keyword_to_meaning("pizza", [
196
+ "Italian food, restaurant, cooking",
197
+ "School, education, homework and tests",
198
+ ])
199
+ ```
200
+
201
+ ---
202
+
203
+ ## 5. Word2Vec Baseline Comparison
204
+
205
+ A gensim Word2Vec engine is included to demonstrate the difference between
206
+ static and contextual embeddings:
207
+
208
+ ```python
209
+ from word2vec_baseline import Word2VecEngine
210
+
211
+ w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
212
+ for doc_id, text in docs.items():
213
+ w2v.add_document(doc_id, text)
214
+ w2v.build_index()
215
+
216
+ # Word-level: which words appear in similar contexts?
217
+ w2v.most_similar_words("pizza", top_k=5)
218
+
219
+ # Sentence-level: averaged word vectors (lossy)
220
+ w2v.compare_texts("pizza gives me homework", "school gives me homework")
221
+
222
+ # Search
223
+ w2v.query("a place where children learn", top_k=3)
224
+ ```
225
+
226
+ **Key limitation:** Word2Vec gives ONE vector per word. "pizza" always has the
227
+ same embedding whether it means food or school. Transformers encode the full
228
+ surrounding context, so the same word gets different embeddings in different passages.
229
+
230
+ ---
231
+
232
+ ## 6. Using the Web UI
233
+
234
+ 1. **Train Model** (start here):
235
+ - Paste your corpus (documents separated by blank lines)
236
+ - Choose strategy: Unsupervised, Contrastive, or Keyword-supervised
237
+ - For keyword strategy, provide a JSON keyword→meaning map
238
+ - Configure base model, epochs, batch size, output path
239
+ - Click "Start Training" — model trains and saves to disk
240
+ - Run "Compare Models" to evaluate base vs trained
241
+
242
+ 2. **Setup:**
243
+ - Initialize engine with your trained model path (e.g. `./trained_model`)
244
+ - Add documents and build the FAISS index
245
+
246
+ 3. **Semantic Search:** query the corpus with trained embeddings
247
+ 4. **Compare Texts:** cosine similarity between any two texts
248
+ 5. **Keyword Analysis:** auto-cluster keyword meanings across documents
249
+ 6. **Keyword Matcher:** match keyword occurrences to candidate meanings
250
+ 7. **Batch Analysis:** multi-keyword analysis with cross-similarity matrix
251
+ 8. **Evaluation:** disambiguation accuracy, retrieval P@K/MRR, similarity histograms
252
+
253
+ ---
254
+
255
+ ## 7. API Endpoints
256
+
257
+ ### Training
258
+ | Method | Endpoint | Description |
259
+ |--------|----------|-------------|
260
+ | POST | `/api/train/unsupervised` | TSDAE domain adaptation |
261
+ | POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
262
+ | POST | `/api/train/keywords` | Keyword-supervised training |
263
+ | POST | `/api/train/evaluate` | Compare base vs trained model |
264
+
265
+ ### Engine
266
+ | Method | Endpoint | Description |
267
+ |--------|----------|-------------|
268
+ | POST | `/api/init` | Initialize engine with a model |
269
+ | POST | `/api/documents` | Add a document to the corpus |
270
+ | POST | `/api/documents/upload` | Upload a file as a document |
271
+ | POST | `/api/index/build` | Build FAISS index |
272
+ | POST | `/api/query` | Semantic search |
273
+ | POST | `/api/compare` | Compare two texts |
274
+ | POST | `/api/analyze/keyword` | Single keyword analysis |
275
+ | POST | `/api/analyze/batch` | Multi-keyword batch analysis |
276
+ | POST | `/api/match` | Match keyword to candidate meanings |
277
+ | GET | `/api/stats` | Corpus statistics |
278
+
279
+ ### Evaluation
280
+ | Method | Endpoint | Description |
281
+ |--------|----------|-------------|
282
+ | POST | `/api/eval/disambiguation` | Disambiguation accuracy |
283
+ | POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
284
+ | GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
285
+
286
+ ### Word2Vec Baseline
287
+ | Method | Endpoint | Description |
288
+ |--------|----------|-------------|
289
+ | POST | `/api/w2v/init` | Train Word2Vec on corpus |
290
+ | POST | `/api/w2v/compare` | Compare two texts (averaged word vectors) |
291
+ | POST | `/api/w2v/query` | Search corpus |
292
+ | POST | `/api/w2v/similar-words` | Find similar words |
293
+
294
+ ---
295
+
296
+ ## 8. Available Base Models
297
+
298
+ | Model | Dim | Size | Quality | Speed |
299
+ |-------|-----|------|---------|-------|
300
+ | `all-MiniLM-L6-v2` | 384 | ~80MB | Good | Fast |
301
+ | `all-mpnet-base-v2` | 768 | ~420MB | Best | Medium |
302
+
303
+ Start with `all-MiniLM-L6-v2` for fast iteration, upgrade to `all-mpnet-base-v2`
304
+ for production quality.
305
+
306
+ ---
307
+
308
+ ## 9. Evaluation Metrics
309
+
310
+ | Metric | What it measures |
311
+ |--------|-----------------|
312
+ | **Accuracy** | % of keyword occurrences correctly matched to their meaning |
313
+ | **Weighted F1** | Harmonic mean of precision/recall, weighted by class frequency |
314
+ | **MRR** | Mean Reciprocal Rank — how early the first relevant result appears |
315
+ | **P@K** | Precision at K — fraction of top-K results that are relevant |
316
+ | **NDCG@K** | Normalized Discounted Cumulative Gain — ranking quality metric |
317
+
318
+ ---
319
+
320
+ ## 10. Tuning Parameters
321
+
322
+ ### Training
323
+
324
+ | Parameter | Default | Notes |
325
+ |-----------|---------|-------|
326
+ | `epochs` | 3-5 | More = better fit but risk overfitting |
327
+ | `batch_size` | 16 | Larger = faster, needs more memory. MNRL benefits from larger batches |
328
+ | `context_window` | 2 | (Keyword strategy) sentences around keyword to include as context |
329
+
330
+ ### Engine
331
+
332
+ | Parameter | Default | Notes |
333
+ |-----------|---------|-------|
334
+ | `chunk_size` | 512 | Characters per chunk. Larger = more context per chunk |
335
+ | `chunk_overlap` | 128 | Overlap prevents losing context at chunk boundaries |
336
+ | `batch_size` | 64 | Encoding batch size for FAISS indexing |
337
+
338
+ ---
339
+
340
+ ## 11. Computational Resources
341
+
342
+ | Task | CPU | GPU (CUDA/MPS) | RAM |
343
+ |------|-----|----------------|-----|
344
+ | Training (small, <1K pairs) | OK | Faster (2-5x) | 4GB+ |
345
+ | Training (medium, 1K-10K pairs) | Slow | Recommended | 8GB+ |
346
+ | Training (large, 10K+ pairs) | Very slow | Required | 16GB+ |
347
+ | Indexing (1K chunks) | OK | Faster | 4GB+ |
348
+ | Querying | Fast | N/A | 2GB+ |
349
+
350
+ **Minimum:** MacBook with 8GB RAM can train small models on CPU.
351
+ **Recommended:** 16GB RAM + GPU (NVIDIA CUDA or Apple Silicon MPS).
352
+
353
+ ---
354
+
355
+ ## 12. Project Structure
356
+
357
+ ```
358
+ esfiles/
359
+ ├── pyproject.toml # Project config & dependencies (uv)
360
+ ├── requirements.txt # Fallback for pip users
361
+ ├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
362
+ ├── training.py # Training pipeline: 3 strategies + evaluation
363
+ ├── evaluation.py # Evaluation pipeline: metrics, reports
364
+ ├── word2vec_baseline.py # Gensim Word2Vec baseline for comparison
365
+ ├── server.py # FastAPI REST API
366
+ ├── demo.py # CLI demo: Word2Vec vs Transformer comparison
367
+ ├── HOWTO.md # This file
368
+ └── frontend/ # React + TypeScript UI
369
+ ├── package.json
370
+ ├── tsconfig.json
371
+ ├── vite.config.ts
372
+ ├── index.html
373
+ └── src/
374
+ ├── main.tsx
375
+ ├── App.tsx
376
+ ├── styles.css
377
+ ├── types.ts
378
+ ├── api.ts
379
+ └── components/
380
+ ├── ScoreBar.tsx
381
+ ├── StatusMessage.tsx
382
+ ├── TrainingPanel.tsx
383
+ ├── EngineSetup.tsx
384
+ ├── SemanticSearch.tsx
385
+ ├── TextCompare.tsx
386
+ ├── KeywordAnalysis.tsx
387
+ ├── KeywordMatcher.tsx
388
+ ├── BatchAnalysis.tsx
389
+ └── EvaluationDashboard.tsx
390
+ ```
README.md CHANGED
@@ -1,12 +1,203 @@
1
  ---
2
  title: Esfiles
3
- emoji: 🏢
4
  colorFrom: green
5
  colorTo: green
6
  sdk: docker
 
7
  pinned: false
8
  license: apache-2.0
9
  short_description: 'A prototype to analyze embeddings and word correlations '
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Esfiles
3
+ emoji: "\U0001F3E2"
4
  colorFrom: green
5
  colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  license: apache-2.0
10
  short_description: 'A prototype to analyze embeddings and word correlations '
11
  ---
12
 
13
+ # Esfiles Contextual Similarity Engine
14
+
15
+ A tool for analyzing word meanings in context using **transformer-based embeddings**. Unlike traditional approaches (Word2Vec) that assign one static vector per word, this system **fine-tunes on your corpus** so the same word gets different embeddings depending on its surrounding context — e.g. detecting that "pizza" is used as code for "school" in a set of documents.
16
+
17
+ Includes a **Word2Vec baseline** for side-by-side comparison.
18
+
19
+ ## Stack
20
+
21
+ | Layer | Technology |
22
+ |-------|-----------|
23
+ | Embeddings | SentenceTransformers (PyTorch) |
24
+ | Vector search | FAISS |
25
+ | Baseline | gensim Word2Vec |
26
+ | Backend | FastAPI (Python) |
27
+ | Frontend | React 19 + TypeScript + Vite |
28
+ | Evaluation | scikit-learn metrics |
29
+ | Deployment | Docker (HuggingFace Spaces, local, Railway) |
30
+
31
+ ## Prerequisites
32
+
33
+ - **Python 3.11+**
34
+ - **Node.js 18+** (for frontend)
35
+ - [uv](https://docs.astral.sh/uv/) (recommended) or pip
36
+
37
+ ## Setup
38
+
39
+ ### 1. Clone the repo
40
+
41
+ ```bash
42
+ git clone <repo-url>
43
+ cd esfiles
44
+ ```
45
+
46
+ ### 2. Install Python dependencies
47
+
48
+ **With uv (recommended):**
49
+
50
+ ```bash
51
+ curl -LsSf https://astral.sh/uv/install.sh | sh
52
+ uv sync
53
+ ```
54
+
55
+ **With pip:**
56
+
57
+ ```bash
58
+ python3 -m venv venv
59
+ source venv/bin/activate
60
+ pip install -r requirements.txt
61
+ ```
62
+
63
+ ### 3. Install frontend dependencies
64
+
65
+ ```bash
66
+ cd frontend
67
+ npm install
68
+ cd ..
69
+ ```
70
+
71
+ ## Usage
72
+
73
+ ### CLI demo
74
+
75
+ Run the Word2Vec vs Transformer comparison demo:
76
+
77
+ ```bash
78
+ uv run python demo.py
79
+ ```
80
+
81
+ This builds both engines on a sample corpus and compares similarity scores, semantic search, keyword matching, and clustering.
82
+
83
+ ### Web UI (development)
84
+
85
+ ```bash
86
+ # Terminal 1 — API server
87
+ uv run python server.py
88
+
89
+ # Terminal 2 — React dev server
90
+ cd frontend && npm run dev
91
+ ```
92
+
93
+ - **API docs:** http://localhost:8000/docs
94
+ - **Frontend:** http://localhost:5173
95
+
96
+ ### Docker
97
+
98
+ ```bash
99
+ docker compose up --build
100
+ ```
101
+
102
+ The app will be available at http://localhost:8000. The Docker build compiles the React frontend and bundles it with the FastAPI server in a single container.
103
+
104
+ ## How it works
105
+
106
+ **Pipeline: TRAIN → INDEX → ANALYZE → EVALUATE**
107
+
108
+ 1. **Train** — Fine-tune a pretrained sentence-transformer on your corpus using one of three strategies:
109
+ - **Unsupervised (TSDAE):** No labels needed. Learns vocabulary and phrasing via denoising autoencoder.
110
+ - **Contrastive:** Auto-mines training pairs from document structure (adjacent sentences = similar).
111
+ - **Keyword-supervised:** You provide a keyword→meaning map (e.g. `{"pizza": "school"}`). The trainer generates context-aware training pairs.
112
+
113
+ 2. **Index** — Chunk your documents and encode them into a FAISS vector index using the fine-tuned model.
114
+
115
+ 3. **Analyze** — Query the index with semantic search, compare texts, analyze keyword meanings across documents, or match keywords to candidate meanings.
116
+
117
+ 4. **Evaluate** — Measure disambiguation accuracy, retrieval metrics (P@K, MRR, NDCG), and clustering quality (NMI).
118
+
119
+ ## API endpoints
120
+
121
+ ### Training
122
+ | Method | Endpoint | Description |
123
+ |--------|----------|-------------|
124
+ | POST | `/api/train/unsupervised` | TSDAE domain adaptation |
125
+ | POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
126
+ | POST | `/api/train/keywords` | Keyword-supervised training |
127
+ | POST | `/api/train/evaluate` | Compare base vs trained model |
128
+
129
+ ### Engine
130
+ | Method | Endpoint | Description |
131
+ |--------|----------|-------------|
132
+ | POST | `/api/init` | Initialize engine with a model |
133
+ | POST | `/api/documents` | Add a document |
134
+ | POST | `/api/documents/upload` | Upload a file as a document |
135
+ | POST | `/api/index/build` | Build FAISS index |
136
+ | POST | `/api/query` | Semantic search |
137
+ | POST | `/api/compare` | Compare two texts |
138
+ | POST | `/api/analyze/keyword` | Single keyword analysis |
139
+ | POST | `/api/analyze/batch` | Multi-keyword batch analysis |
140
+ | POST | `/api/match` | Match keyword to candidate meanings |
141
+ | GET | `/api/stats` | Corpus statistics |
142
+
143
+ ### Evaluation
144
+ | Method | Endpoint | Description |
145
+ |--------|----------|-------------|
146
+ | POST | `/api/eval/disambiguation` | Disambiguation accuracy |
147
+ | POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
148
+ | GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
149
+
150
+ ### Word2Vec baseline
151
+ | Method | Endpoint | Description |
152
+ |--------|----------|-------------|
153
+ | POST | `/api/w2v/init` | Train Word2Vec on corpus |
154
+ | POST | `/api/w2v/compare` | Compare two texts |
155
+ | POST | `/api/w2v/query` | Search corpus |
156
+ | POST | `/api/w2v/similar-words` | Find similar words |
157
+
158
+ Full interactive docs available at `/docs` when the server is running.
159
+
160
+ ## Project structure
161
+
162
+ ```
163
+ esfiles/
164
+ ├── pyproject.toml # Dependencies (uv)
165
+ ├── requirements.txt # Fallback for pip
166
+ ├── uv.lock # Lockfile for reproducible installs
167
+ ├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
168
+ ├── training.py # Training pipeline: 3 strategies + evaluation
169
+ ├── evaluation.py # Evaluation: metrics, reports
170
+ ├── word2vec_baseline.py # gensim Word2Vec baseline
171
+ ├── data_loader.py # Epstein Files dataset loader (HuggingFace + ChromaDB)
172
+ ├── server.py # FastAPI REST API
173
+ ├── demo.py # CLI demo: Word2Vec vs Transformer comparison
174
+ ├── Dockerfile # Multi-stage build (Node + Python)
175
+ ├── docker-compose.yml # Local Docker setup
176
+ ├── HOWTO.md # In-depth usage guide
177
+ └── frontend/ # React + TypeScript UI
178
+ ├── package.json
179
+ ├── vite.config.ts
180
+ ├── index.html
181
+ └── src/
182
+ ├── App.tsx # Main app with tab navigation
183
+ ├── api.ts # API client
184
+ ├── types.ts # TypeScript types
185
+ └── components/ # UI components (training, search, evaluation, etc.)
186
+ ```
187
+
188
+ ## Base models
189
+
190
+ | Model | Dimensions | Quality | Speed |
191
+ |-------|-----------|---------|-------|
192
+ | `all-MiniLM-L6-v2` | 384 | Good | Fast |
193
+ | `all-mpnet-base-v2` | 768 | Best | Medium |
194
+
195
+ Start with `all-MiniLM-L6-v2` for iteration, use `all-mpnet-base-v2` for production.
196
+
197
+ ## Further reading
198
+
199
+ See [HOWTO.md](HOWTO.md) for detailed usage examples including Python API usage, training configuration, tuning parameters, and evaluation metrics.
200
+
201
+ ## License
202
+
203
+ Apache 2.0
contextual_similarity.py ADDED
@@ -0,0 +1,850 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contextual Word Similarity Engine
3
+
4
+ Uses transformer-based sentence embeddings (SentenceTransformers) and FAISS
5
+ vector search to find and compare contextual meanings of keywords within
6
+ large documents. Unlike static embeddings (Word2Vec/GloVe), this captures
7
+ how word meaning changes based on surrounding context.
8
+
9
+ Usage:
10
+ engine = ContextualSimilarityEngine()
11
+ engine.add_document("my_doc", text)
12
+ engine.build_index()
13
+ results = engine.analyze_keyword("pizza", top_k=10)
14
+ """
15
+
16
+ import re
17
+ import logging
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import Optional
21
+
22
+ import faiss
23
+ import numpy as np
24
+ from sentence_transformers import SentenceTransformer, util
25
+ from sklearn.cluster import AgglomerativeClustering
26
+ from tqdm import tqdm
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @dataclass
32
+ class Chunk:
33
+ """A passage of text from a document with metadata."""
34
+ text: str
35
+ doc_id: str
36
+ chunk_index: int
37
+ start_char: int
38
+ end_char: int
39
+
40
+ def __repr__(self):
41
+ preview = self.text[:80].replace("\n", " ")
42
+ return f"Chunk(doc={self.doc_id!r}, idx={self.chunk_index}, text={preview!r}...)"
43
+
44
+
45
@dataclass
class SimilarityResult:
    """A single similarity match returned by a search.

    Pairs a corpus chunk with its similarity score and its 1-based
    position in the ranked result list.
    """
    chunk: Chunk    # the matched passage
    score: float    # cosine similarity (inner product of normalized vectors)
    rank: int       # 1-based rank within the result list
51
+
52
+
53
@dataclass
class KeywordContext:
    """A keyword occurrence with its surrounding context."""
    keyword: str    # the keyword as queried
    chunk: Chunk    # chunk whose text contains the keyword
    # (start, end) character spans of each match within chunk.text
    highlight_positions: list[tuple[int, int]] = field(default_factory=list)
59
+
60
+
61
@dataclass
class KeywordAnalysis:
    """Full analysis of a keyword's contextual meanings across a corpus."""
    keyword: str               # analyzed keyword
    total_occurrences: int     # number of chunks containing the keyword
    # One dict per meaning cluster: cluster_id, size, representative_text,
    # contexts, similar_passages
    meaning_clusters: list[dict] = field(default_factory=list)
    # other keyword -> cosine similarity between context centroids
    cross_keyword_similarities: dict[str, float] = field(default_factory=dict)
68
+
69
+
70
class ContextualSimilarityEngine:
    """
    Engine for contextual word similarity analysis using transformer embeddings.

    Loads documents, chunks them into passages, embeds with a SentenceTransformer
    model, indexes with FAISS, and provides methods to:
      - Find all contextual usages of a keyword
      - Cluster keyword usages into distinct meanings
      - Compare keyword contexts across documents
      - Find passages most similar to a query
      - Batch-analyze multiple keywords

    Typical flow: add_document() one or more times, then build_index(),
    then query/analyze. Adding a document invalidates any existing index.
    """
82
+
83
    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        chunk_size: int = 512,
        chunk_overlap: int = 128,
        device: Optional[str] = None,
        batch_size: int = 64,
    ):
        """
        Args:
            model_name: HuggingFace SentenceTransformer model name.
                - "all-MiniLM-L6-v2": fast, good quality (384-dim)
                - "all-mpnet-base-v2": best quality general-purpose (768-dim)
                - "BAAI/bge-large-en-v1.5": high accuracy, larger (1024-dim)
            chunk_size: Max characters per chunk.
            chunk_overlap: Overlap between consecutive chunks (preserves context at boundaries).
            device: PyTorch device ("cpu", "cuda", "mps"). Auto-detected if None.
            batch_size: Batch size for encoding (tune for your GPU memory).
        """
        logger.info(f"Loading model: {model_name}")
        self._model_name = model_name
        # Downloads/loads the transformer; may take a while on first run.
        self.model = SentenceTransformer(model_name, device=device)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        # Dimension of the model's sentence vectors; used to size the FAISS index.
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        # Storage: chunks is the corpus; embeddings/index stay None until
        # build_index() runs (and are reset when documents are added).
        self.chunks: list[Chunk] = []
        self.embeddings: Optional[np.ndarray] = None
        self.index: Optional[faiss.IndexFlatIP] = None
        self._doc_ids: set[str] = set()
115
+
116
+ # ------------------------------------------------------------------ #
117
+ # Document loading & chunking
118
+ # ------------------------------------------------------------------ #
119
+
120
+ def add_document(self, doc_id: str, text: str) -> list[Chunk]:
121
+ """
122
+ Chunk a document and add it to the corpus.
123
+
124
+ Args:
125
+ doc_id: Unique identifier for this document.
126
+ text: Full document text.
127
+
128
+ Returns:
129
+ List of Chunk objects created from this document.
130
+ """
131
+ if doc_id in self._doc_ids:
132
+ raise ValueError(f"Document '{doc_id}' already added. Use a unique doc_id.")
133
+ self._doc_ids.add(doc_id)
134
+
135
+ new_chunks = self._chunk_text(text, doc_id)
136
+ self.chunks.extend(new_chunks)
137
+ logger.info(f"Added document '{doc_id}': {len(new_chunks)} chunks")
138
+
139
+ # Invalidate index so user must rebuild
140
+ self.embeddings = None
141
+ self.index = None
142
+
143
+ return new_chunks
144
+
145
+ def add_document_from_file(self, file_path: str, doc_id: Optional[str] = None) -> list[Chunk]:
146
+ """Load a text file and add it as a document."""
147
+ path = Path(file_path).resolve()
148
+ base_dir = Path(__file__).parent.resolve()
149
+ if not path.is_relative_to(base_dir):
150
+ raise ValueError("File path must be within the project directory.")
151
+ if not path.exists():
152
+ raise FileNotFoundError(f"File not found: {file_path}")
153
+ text = path.read_text(encoding="utf-8")
154
+ return self.add_document(doc_id or path.stem, text)
155
+
156
+ def _chunk_text(self, text: str, doc_id: str) -> list[Chunk]:
157
+ """
158
+ Split text into overlapping chunks, breaking at sentence boundaries
159
+ when possible to preserve semantic coherence.
160
+ """
161
+ # Normalize whitespace
162
+ text = re.sub(r"\n{3,}", "\n\n", text)
163
+
164
+ chunks = []
165
+ start = 0
166
+ chunk_idx = 0
167
+
168
+ while start < len(text):
169
+ end = start + self.chunk_size
170
+
171
+ # If we're not at the end, try to break at a sentence boundary
172
+ if end < len(text):
173
+ # Look for sentence-ending punctuation near the chunk boundary
174
+ search_region = text[max(end - 100, start):end]
175
+ # Find last sentence break in the search region
176
+ for sep in [". ", ".\n", "! ", "!\n", "? ", "?\n", "\n\n"]:
177
+ last_break = search_region.rfind(sep)
178
+ if last_break != -1:
179
+ end = max(end - 100, start) + last_break + len(sep)
180
+ break
181
+
182
+ chunk_text = text[start:end].strip()
183
+ if chunk_text:
184
+ chunks.append(Chunk(
185
+ text=chunk_text,
186
+ doc_id=doc_id,
187
+ chunk_index=chunk_idx,
188
+ start_char=start,
189
+ end_char=end,
190
+ ))
191
+ chunk_idx += 1
192
+
193
+ # Advance with overlap
194
+ start = end - self.chunk_overlap if end < len(text) else end
195
+
196
+ return chunks
197
+
198
+ # ------------------------------------------------------------------ #
199
+ # Embedding & indexing
200
+ # ------------------------------------------------------------------ #
201
+
202
+ def build_index(self, normalize: bool = True, show_progress: bool = True) -> None:
203
+ """
204
+ Embed all chunks and build a FAISS index for fast similarity search.
205
+
206
+ Args:
207
+ normalize: L2-normalize embeddings (enables cosine similarity via inner product).
208
+ show_progress: Show a progress bar during encoding.
209
+ """
210
+ if not self.chunks:
211
+ raise RuntimeError("No documents loaded. Call add_document() first.")
212
+
213
+ logger.info(f"Encoding {len(self.chunks)} chunks...")
214
+ texts = [c.text for c in self.chunks]
215
+
216
+ self.embeddings = self.model.encode(
217
+ texts,
218
+ batch_size=self.batch_size,
219
+ show_progress_bar=show_progress,
220
+ convert_to_numpy=True,
221
+ normalize_embeddings=normalize,
222
+ )
223
+
224
+ # Build FAISS inner-product index (cosine similarity when vectors are normalized)
225
+ self.index = faiss.IndexFlatIP(self.embedding_dim)
226
+ self.index.add(self.embeddings.astype(np.float32))
227
+
228
+ logger.info(f"Index built: {self.index.ntotal} vectors, dim={self.embedding_dim}")
229
+
230
+ # ------------------------------------------------------------------ #
231
+ # Core query methods
232
+ # ------------------------------------------------------------------ #
233
+
234
+ def query(self, text: str, top_k: int = 10) -> list[SimilarityResult]:
235
+ """
236
+ Find the most similar chunks to a query text.
237
+
238
+ Args:
239
+ text: Query string (sentence, phrase, or keyword in context).
240
+ top_k: Number of results to return.
241
+
242
+ Returns:
243
+ List of SimilarityResult sorted by descending similarity score.
244
+ """
245
+ self._ensure_index()
246
+
247
+ query_vec = self.model.encode(
248
+ [text], normalize_embeddings=True, convert_to_numpy=True
249
+ ).astype(np.float32)
250
+
251
+ scores, indices = self.index.search(query_vec, top_k)
252
+
253
+ results = []
254
+ for rank, (score, idx) in enumerate(zip(scores[0], indices[0])):
255
+ if idx == -1:
256
+ continue
257
+ results.append(SimilarityResult(
258
+ chunk=self.chunks[idx],
259
+ score=float(score),
260
+ rank=rank + 1,
261
+ ))
262
+ return results
263
+
264
+ def compare_texts(self, text_a: str, text_b: str) -> float:
265
+ """
266
+ Compute cosine similarity between two texts directly.
267
+
268
+ Returns:
269
+ Similarity score in [-1, 1] (typically [0, 1] for natural language).
270
+ """
271
+ vecs = self.model.encode(
272
+ [text_a, text_b], normalize_embeddings=True, convert_to_tensor=True
273
+ )
274
+ return float(util.pytorch_cos_sim(vecs[0], vecs[1]).item())
275
+
276
+ # ------------------------------------------------------------------ #
277
+ # Keyword analysis
278
+ # ------------------------------------------------------------------ #
279
+
280
+ def find_keyword_contexts(
281
+ self, keyword: str, case_sensitive: bool = False
282
+ ) -> list[KeywordContext]:
283
+ """
284
+ Find all chunks containing a keyword and return them as KeywordContext objects.
285
+
286
+ Args:
287
+ keyword: The word or phrase to search for.
288
+ case_sensitive: Whether matching is case-sensitive.
289
+
290
+ Returns:
291
+ List of KeywordContext with chunk and highlight positions.
292
+ """
293
+ if len(keyword) > 200:
294
+ raise ValueError("Keyword must be 200 characters or fewer.")
295
+ flags = 0 if case_sensitive else re.IGNORECASE
296
+ pattern = re.compile(r"\b" + re.escape(keyword) + r"\b", flags)
297
+
298
+ contexts = []
299
+ for chunk in self.chunks:
300
+ matches = list(pattern.finditer(chunk.text))
301
+ if matches:
302
+ positions = [(m.start(), m.end()) for m in matches]
303
+ contexts.append(KeywordContext(
304
+ keyword=keyword,
305
+ chunk=chunk,
306
+ highlight_positions=positions,
307
+ ))
308
+ return contexts
309
+
310
+ def analyze_keyword(
311
+ self,
312
+ keyword: str,
313
+ top_k: int = 10,
314
+ cluster_threshold: float = 0.35,
315
+ case_sensitive: bool = False,
316
+ ) -> KeywordAnalysis:
317
+ """
318
+ Analyze all contextual usages of a keyword across the corpus.
319
+
320
+ Finds every chunk containing the keyword, embeds them, clusters them
321
+ by semantic similarity (agglomerative clustering), and returns a
322
+ structured analysis with distinct meaning groups.
323
+
324
+ Args:
325
+ keyword: Word or phrase to analyze.
326
+ top_k: Max similar chunks to return per meaning cluster.
327
+ cluster_threshold: Distance threshold for clustering (lower = more clusters).
328
+ 0.35 works well for clearly distinct meanings; raise to 0.5+ to merge similar ones.
329
+ case_sensitive: Whether keyword matching is case-sensitive.
330
+
331
+ Returns:
332
+ KeywordAnalysis with meaning clusters and similarity info.
333
+ """
334
+ self._ensure_index()
335
+ contexts = self.find_keyword_contexts(keyword, case_sensitive)
336
+
337
+ if not contexts:
338
+ return KeywordAnalysis(keyword=keyword, total_occurrences=0)
339
+
340
+ # Get embeddings for keyword-containing chunks
341
+ chunk_indices = []
342
+ for ctx in contexts:
343
+ idx = self.chunks.index(ctx.chunk)
344
+ chunk_indices.append(idx)
345
+
346
+ kw_embeddings = self.embeddings[chunk_indices]
347
+
348
+ # Cluster the keyword contexts by semantic similarity
349
+ clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)
350
+
351
+ # Build meaning clusters
352
+ meaning_clusters = []
353
+ for cluster_id in sorted(set(clusters)):
354
+ member_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
355
+ member_contexts = [contexts[i] for i in member_indices]
356
+ member_embeds = kw_embeddings[member_indices]
357
+
358
+ # Centroid of this cluster
359
+ centroid = member_embeds.mean(axis=0, keepdims=True).astype(np.float32)
360
+ faiss.normalize_L2(centroid)
361
+
362
+ # Find top_k most similar chunks in the full corpus to this meaning
363
+ scores, idx_arr = self.index.search(centroid, top_k)
364
+ similar = []
365
+ for rank, (score, idx) in enumerate(zip(scores[0], idx_arr[0])):
366
+ if idx == -1:
367
+ continue
368
+ similar.append(SimilarityResult(
369
+ chunk=self.chunks[idx],
370
+ score=float(score),
371
+ rank=rank + 1,
372
+ ))
373
+
374
+ meaning_clusters.append({
375
+ "cluster_id": cluster_id,
376
+ "size": len(member_indices),
377
+ "representative_text": member_contexts[0].chunk.text[:200],
378
+ "contexts": member_contexts,
379
+ "similar_passages": similar,
380
+ })
381
+
382
+ return KeywordAnalysis(
383
+ keyword=keyword,
384
+ total_occurrences=len(contexts),
385
+ meaning_clusters=meaning_clusters,
386
+ )
387
+
388
+ def batch_analyze_keywords(
389
+ self,
390
+ keywords: list[str],
391
+ top_k: int = 10,
392
+ cluster_threshold: float = 0.35,
393
+ compare_across: bool = True,
394
+ ) -> dict[str, KeywordAnalysis]:
395
+ """
396
+ Analyze multiple keywords and optionally compute cross-keyword similarities.
397
+
398
+ Args:
399
+ keywords: List of keywords to analyze.
400
+ top_k: Results per cluster.
401
+ cluster_threshold: Clustering distance threshold.
402
+ compare_across: If True, compute pairwise similarity between keyword contexts.
403
+
404
+ Returns:
405
+ Dict mapping keyword -> KeywordAnalysis.
406
+ """
407
+ results = {}
408
+ for kw in tqdm(keywords, desc="Analyzing keywords"):
409
+ results[kw] = self.analyze_keyword(kw, top_k, cluster_threshold)
410
+
411
+ if compare_across and len(keywords) > 1:
412
+ self._compute_cross_keyword_similarities(results)
413
+
414
+ return results
415
+
416
+ def _compute_cross_keyword_similarities(
417
+ self, analyses: dict[str, KeywordAnalysis]
418
+ ) -> None:
419
+ """Compute average cosine similarity between each pair of keywords' contexts."""
420
+ keyword_centroids = {}
421
+ for kw, analysis in analyses.items():
422
+ if not analysis.meaning_clusters:
423
+ continue
424
+ # Collect all context embeddings for this keyword
425
+ all_indices = []
426
+ for cluster in analysis.meaning_clusters:
427
+ for ctx in cluster["contexts"]:
428
+ idx = self.chunks.index(ctx.chunk)
429
+ all_indices.append(idx)
430
+ if all_indices:
431
+ embeds = self.embeddings[all_indices]
432
+ centroid = embeds.mean(axis=0)
433
+ norm = np.linalg.norm(centroid)
434
+ if norm > 0:
435
+ centroid = centroid / norm
436
+ keyword_centroids[kw] = centroid
437
+
438
+ # Pairwise similarities
439
+ kw_list = list(keyword_centroids.keys())
440
+ for i, kw_a in enumerate(kw_list):
441
+ sims = {}
442
+ for j, kw_b in enumerate(kw_list):
443
+ if i != j:
444
+ score = float(np.dot(keyword_centroids[kw_a], keyword_centroids[kw_b]))
445
+ sims[kw_b] = score
446
+ if kw_a in analyses:
447
+ analyses[kw_a].cross_keyword_similarities = sims
448
+
449
+ # ------------------------------------------------------------------ #
450
+ # Contextual keyword matching (the core use case)
451
+ # ------------------------------------------------------------------ #
452
+
453
+ def match_keyword_to_meaning(
454
+ self,
455
+ keyword: str,
456
+ candidate_meanings: list[str],
457
+ ) -> list[dict]:
458
+ """
459
+ Given a keyword and a list of candidate meanings (words/phrases),
460
+ find which meaning each occurrence of the keyword is closest to.
461
+
462
+ This is the core "pizza means school" use case: you provide the keyword
463
+ "pizza" and candidates ["pizza (food)", "school", "homework"], and this
464
+ method tells you which meaning each usage of "pizza" maps to.
465
+
466
+ Args:
467
+ keyword: The keyword to analyze (e.g. "pizza").
468
+ candidate_meanings: List of meaning descriptions (e.g. ["food", "school"]).
469
+
470
+ Returns:
471
+ List of dicts with keys: chunk, best_match, scores (all candidates).
472
+ """
473
+ self._ensure_index()
474
+
475
+ contexts = self.find_keyword_contexts(keyword)
476
+ if not contexts:
477
+ return []
478
+
479
+ # Embed all candidate meanings
480
+ candidate_vecs = self.model.encode(
481
+ candidate_meanings, normalize_embeddings=True, convert_to_tensor=True
482
+ )
483
+
484
+ results = []
485
+ for ctx in contexts:
486
+ # Embed the chunk containing the keyword
487
+ chunk_vec = self.model.encode(
488
+ [ctx.chunk.text], normalize_embeddings=True, convert_to_tensor=True
489
+ )
490
+
491
+ # Score against each candidate
492
+ scores = util.pytorch_cos_sim(chunk_vec, candidate_vecs)[0]
493
+ score_dict = {
494
+ meaning: float(scores[i]) for i, meaning in enumerate(candidate_meanings)
495
+ }
496
+ best = max(score_dict, key=score_dict.get)
497
+
498
+ results.append({
499
+ "chunk": ctx.chunk,
500
+ "best_match": best,
501
+ "best_score": score_dict[best],
502
+ "all_scores": score_dict,
503
+ })
504
+
505
+ return results
506
+
507
+ # ------------------------------------------------------------------ #
508
+ # Context inference (keyword → meaning words)
509
+ # ------------------------------------------------------------------ #
510
+
511
    # Common English stopwords excluded from context-word extraction
    # (infer_keyword_meanings) and from the vocabulary in similar_words.
    # Built from a space-separated literal; frozenset() silently dedupes
    # the few repeated entries in the string.
    _STOPWORDS = frozenset(
        "a an the and or but in on at to for of is it that this was were be been "
        "being have has had do does did will would shall should may might can could "
        "not no nor so if then than too very just about above after again all also "
        "am are as between both by each few from further get got he her here hers "
        "herself him himself his how i its itself me more most my myself no nor "
        "only other our ours ourselves out over own same she some such their theirs "
        "them themselves there these they those through under until up us we what "
        "when where which while who whom why with you your yours yourself yourselves "
        "one two three four five six seven eight nine ten into been being because "
        "during before between against without within along across behind since "
        "upon around among".split()
    )
525
+
526
+ def infer_keyword_meanings(
527
+ self,
528
+ keyword: str,
529
+ context_window: int = 120,
530
+ top_words: int = 8,
531
+ cluster_threshold: float = 0.35,
532
+ max_meanings: int = 10,
533
+ ) -> dict:
534
+ """
535
+ Infer what a keyword likely means based on its surrounding context words.
536
+
537
+ Finds all occurrences, clusters them by semantic similarity, then extracts
538
+ the most distinctive co-occurring words for each meaning cluster.
539
+
540
+ Args:
541
+ keyword: The keyword to analyze.
542
+ context_window: Characters around each keyword occurrence to examine.
543
+ top_words: Number of associated words to return per meaning.
544
+ cluster_threshold: Distance threshold for clustering.
545
+ max_meanings: Maximum number of meaning clusters to return.
546
+
547
+ Returns:
548
+ Dict with keyword, total_occurrences, and meanings list.
549
+ """
550
+ self._ensure_index()
551
+ contexts = self.find_keyword_contexts(keyword)
552
+
553
+ if not contexts:
554
+ return {
555
+ "keyword": keyword,
556
+ "total_occurrences": 0,
557
+ "meanings": [],
558
+ }
559
+
560
+ # Get embeddings and cluster
561
+ chunk_indices = [self.chunks.index(ctx.chunk) for ctx in contexts]
562
+ kw_embeddings = self.embeddings[chunk_indices]
563
+ clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)
564
+
565
+ total = len(contexts)
566
+ kw_lower = keyword.lower()
567
+ word_pattern = re.compile(r"[a-zA-Z]{3,}")
568
+
569
+ # Global word frequencies (across all occurrences) for TF-IDF-like scoring
570
+ global_word_counts: dict[str, int] = {}
571
+ cluster_data: dict[int, list[dict[str, int]]] = {}
572
+
573
+ for i, ctx in enumerate(contexts):
574
+ cluster_id = clusters[i]
575
+ if cluster_id not in cluster_data:
576
+ cluster_data[cluster_id] = []
577
+
578
+ # Extract context window around each keyword occurrence
579
+ local_counts: dict[str, int] = {}
580
+ for start, end in ctx.highlight_positions:
581
+ window_start = max(0, start - context_window)
582
+ window_end = min(len(ctx.chunk.text), end + context_window)
583
+ window_text = ctx.chunk.text[window_start:window_end].lower()
584
+
585
+ for word_match in word_pattern.finditer(window_text):
586
+ w = word_match.group()
587
+ if w == kw_lower or w in self._STOPWORDS or len(w) < 3:
588
+ continue
589
+ local_counts[w] = local_counts.get(w, 0) + 1
590
+ global_word_counts[w] = global_word_counts.get(w, 0) + 1
591
+
592
+ cluster_data[cluster_id].append(local_counts)
593
+
594
+ # Build meanings from clusters
595
+ meanings = []
596
+ for cluster_id in sorted(cluster_data.keys()):
597
+ members = cluster_data[cluster_id]
598
+ count = len(members)
599
+ confidence = round(count / total, 3)
600
+
601
+ # Aggregate word counts for this cluster
602
+ cluster_word_counts: dict[str, int] = {}
603
+ for member_counts in members:
604
+ for w, c in member_counts.items():
605
+ cluster_word_counts[w] = cluster_word_counts.get(w, 0) + c
606
+
607
+ # Score words: cluster frequency weighted by distinctiveness
608
+ # (how much more frequent in this cluster vs globally)
609
+ num_clusters = len(cluster_data)
610
+ word_scores: dict[str, float] = {}
611
+ for w, cluster_count in cluster_word_counts.items():
612
+ global_count = global_word_counts.get(w, 1)
613
+ # TF in cluster * IDF-like distinctiveness
614
+ tf = cluster_count / max(sum(cluster_word_counts.values()), 1)
615
+ distinctiveness = (cluster_count / global_count) if num_clusters > 1 else 1.0
616
+ word_scores[w] = tf * (0.5 + 0.5 * distinctiveness)
617
+
618
+ # Get top words
619
+ sorted_words = sorted(word_scores.items(), key=lambda x: -x[1])[:top_words]
620
+ associated_words = [
621
+ {"word": w, "score": round(s, 4)} for w, s in sorted_words
622
+ ]
623
+
624
+ # Get example context snippets
625
+ example_contexts = []
626
+ member_indices = [j for j, c in enumerate(clusters) if c == cluster_id]
627
+ for j in member_indices[:3]: # max 3 examples
628
+ ctx = contexts[j]
629
+ if ctx.highlight_positions:
630
+ start, end = ctx.highlight_positions[0]
631
+ snippet_start = max(0, start - 80)
632
+ snippet_end = min(len(ctx.chunk.text), end + 80)
633
+ snippet = ctx.chunk.text[snippet_start:snippet_end].strip()
634
+ if snippet_start > 0:
635
+ snippet = "..." + snippet
636
+ if snippet_end < len(ctx.chunk.text):
637
+ snippet = snippet + "..."
638
+ example_contexts.append({
639
+ "doc_id": ctx.chunk.doc_id,
640
+ "snippet": snippet,
641
+ })
642
+
643
+ meanings.append({
644
+ "cluster_id": cluster_id,
645
+ "occurrences": count,
646
+ "confidence": confidence,
647
+ "associated_words": associated_words,
648
+ "example_contexts": example_contexts,
649
+ })
650
+
651
+ # Sort by confidence descending
652
+ meanings.sort(key=lambda m: -m["confidence"])
653
+ meanings = meanings[:max_meanings]
654
+
655
+ return {
656
+ "keyword": keyword,
657
+ "total_occurrences": total,
658
+ "meanings": meanings,
659
+ }
660
+
661
+ # ------------------------------------------------------------------ #
662
+ # Utilities
663
+ # ------------------------------------------------------------------ #
664
+
665
+ def _cluster_embeddings(
666
+ self, embeddings: np.ndarray, threshold: float = 0.35
667
+ ) -> list[int]:
668
+ """Cluster embeddings using agglomerative clustering with cosine distance."""
669
+ if len(embeddings) == 1:
670
+ return [0]
671
+
672
+ clustering = AgglomerativeClustering(
673
+ n_clusters=None,
674
+ distance_threshold=threshold,
675
+ metric="cosine",
676
+ linkage="average",
677
+ )
678
+ labels = clustering.fit_predict(embeddings)
679
+ return labels.tolist()
680
+
681
+ def similar_words(self, word: str, top_k: int = 10) -> list[dict]:
682
+ """
683
+ Find words that appear in similar contexts using transformer embeddings.
684
+
685
+ Extracts unique words from the corpus, encodes them, and finds nearest
686
+ neighbors by cosine similarity. Unlike Word2Vec (one static vector per word),
687
+ this uses the transformer's contextual understanding.
688
+
689
+ Args:
690
+ word: Target word.
691
+ top_k: Number of similar words to return.
692
+
693
+ Returns:
694
+ List of {"word": str, "score": float} sorted by descending similarity.
695
+ """
696
+ self._ensure_index()
697
+
698
+ word_pattern = re.compile(r"[a-zA-Z]{3,}")
699
+ word_lower = word.lower()
700
+
701
+ # Collect unique words from corpus (skip stopwords + the query word itself)
702
+ vocab: set[str] = set()
703
+ for chunk in self.chunks:
704
+ for match in word_pattern.finditer(chunk.text):
705
+ w = match.group().lower()
706
+ if w != word_lower and w not in self._STOPWORDS:
707
+ vocab.add(w)
708
+
709
+ if not vocab:
710
+ return []
711
+
712
+ vocab_list = sorted(vocab)
713
+ logger.info("Similar words: encoding %d vocabulary words for '%s'", len(vocab_list), word)
714
+
715
+ # Encode the query word and all vocab words
716
+ all_texts = [word] + vocab_list
717
+ embeddings = self.model.encode(
718
+ all_texts,
719
+ batch_size=self.batch_size,
720
+ show_progress_bar=False,
721
+ convert_to_numpy=True,
722
+ normalize_embeddings=True,
723
+ )
724
+
725
+ query_vec = embeddings[0:1]
726
+ vocab_vecs = embeddings[1:]
727
+
728
+ # Compute cosine similarities
729
+ scores = (vocab_vecs @ query_vec.T).flatten()
730
+ top_indices = np.argsort(scores)[::-1][:top_k]
731
+
732
+ return [
733
+ {"word": vocab_list[i], "score": round(float(scores[i]), 4)}
734
+ for i in top_indices
735
+ ]
736
+
737
    def _ensure_index(self):
        # Guard used by all query/analysis methods: fail fast with a clear
        # message if build_index() has not been called (or was invalidated
        # by add_document()).
        if self.index is None:
            raise RuntimeError("Index not built. Call build_index() first.")
740
+
741
+ def get_stats(self) -> dict:
742
+ """Return corpus statistics."""
743
+ return {
744
+ "total_chunks": len(self.chunks),
745
+ "total_documents": len(self._doc_ids),
746
+ "document_ids": sorted(self._doc_ids),
747
+ "index_built": self.index is not None,
748
+ "embedding_dim": self.embedding_dim,
749
+ "model_name": self._model_name,
750
+ }
751
+
752
+ # ------------------------------------------------------------------ #
753
+ # Persistence (save / load engine state to disk)
754
+ # ------------------------------------------------------------------ #
755
+
756
+ def save(self, directory: str) -> dict:
757
+ """
758
+ Save the full engine state (chunks, embeddings, FAISS index) to disk.
759
+
760
+ Args:
761
+ directory: Path to save directory (created if needed).
762
+
763
+ Returns:
764
+ Stats dict with what was saved.
765
+ """
766
+ import json, pickle
767
+
768
+ save_dir = Path(directory)
769
+ save_dir.mkdir(parents=True, exist_ok=True)
770
+
771
+ # Save chunks
772
+ with open(save_dir / "chunks.pkl", "wb") as f:
773
+ pickle.dump(self.chunks, f)
774
+
775
+ # Save metadata
776
+ meta = {
777
+ "model_name": self._model_name,
778
+ "chunk_size": self.chunk_size,
779
+ "chunk_overlap": self.chunk_overlap,
780
+ "batch_size": self.batch_size,
781
+ "embedding_dim": self.embedding_dim,
782
+ "doc_ids": sorted(self._doc_ids),
783
+ }
784
+ with open(save_dir / "meta.json", "w") as f:
785
+ json.dump(meta, f, indent=2)
786
+
787
+ # Save embeddings + FAISS index
788
+ saved_index = False
789
+ if self.embeddings is not None:
790
+ np.save(save_dir / "embeddings.npy", self.embeddings)
791
+ if self.index is not None:
792
+ faiss.write_index(self.index, str(save_dir / "index.faiss"))
793
+ saved_index = True
794
+
795
+ logger.info("Engine saved to %s: %d chunks, %d docs, index=%s",
796
+ directory, len(self.chunks), len(self._doc_ids), saved_index)
797
+ return {
798
+ "directory": str(save_dir),
799
+ "chunks": len(self.chunks),
800
+ "documents": len(self._doc_ids),
801
+ "index_saved": saved_index,
802
+ }
803
+
804
+ @classmethod
805
+ def load(cls, directory: str, device: Optional[str] = None) -> "ContextualSimilarityEngine":
806
+ """
807
+ Load a previously saved engine state from disk.
808
+
809
+ Args:
810
+ directory: Path to the saved state directory.
811
+ device: PyTorch device override.
812
+
813
+ Returns:
814
+ A fully restored ContextualSimilarityEngine instance.
815
+ """
816
+ import json, pickle
817
+
818
+ save_dir = Path(directory)
819
+ if not save_dir.is_dir():
820
+ raise FileNotFoundError(f"No saved state at {directory}")
821
+
822
+ # Load metadata
823
+ with open(save_dir / "meta.json") as f:
824
+ meta = json.load(f)
825
+
826
+ # Create engine (loads the model)
827
+ engine = cls(
828
+ model_name=meta["model_name"],
829
+ chunk_size=meta["chunk_size"],
830
+ chunk_overlap=meta["chunk_overlap"],
831
+ device=device,
832
+ batch_size=meta["batch_size"],
833
+ )
834
+
835
+ # Restore chunks
836
+ with open(save_dir / "chunks.pkl", "rb") as f:
837
+ engine.chunks = pickle.load(f)
838
+ engine._doc_ids = set(meta["doc_ids"])
839
+
840
+ # Restore embeddings + index
841
+ emb_path = save_dir / "embeddings.npy"
842
+ idx_path = save_dir / "index.faiss"
843
+ if emb_path.exists():
844
+ engine.embeddings = np.load(emb_path)
845
+ if idx_path.exists():
846
+ engine.index = faiss.read_index(str(idx_path))
847
+
848
+ logger.info("Engine loaded from %s: %d chunks, %d docs, index=%s",
849
+ directory, len(engine.chunks), len(engine._doc_ids), engine.index is not None)
850
+ return engine
data_loader.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Epstein Files Dataset Loader
3
+
4
+ Loads data from two HuggingFace sources:
5
+ 1. teyler/epstein-files-20k — raw OCR text (2.1M rows, filename + text)
6
+ 2. devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB — pre-computed
7
+ all-MiniLM-L6-v2 embeddings in ChromaDB format
8
+
9
+ Both can feed directly into the ContextualSimilarityEngine pipeline.
10
+ """
11
+
12
+ import logging
13
+ import re
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import numpy as np
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # HuggingFace dataset identifiers
23
+ RAW_DATASET = "teyler/epstein-files-20k"
24
+ EMBEDDINGS_DATASET = "devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB"
25
+
26
+
27
def load_raw_dataset(
    max_docs: Optional[int] = None,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
) -> list[dict]:
    """
    Load raw Epstein Files from HuggingFace.

    Args:
        max_docs: Limit number of documents loaded (None = all ~2.1M).
        min_text_length: Skip documents shorter than this.
        source_filter: Filter by filename prefix, e.g. "TEXT-" or "IMAGES-".

    Returns:
        List of {"doc_id": str, "text": str, "filename": str}
    """
    from datasets import load_dataset

    started = time.time()
    logger.info(f"Loading {RAW_DATASET} from HuggingFace...")

    rows = load_dataset(RAW_DATASET, split="train")
    documents: list[dict] = []

    for i, row in enumerate(rows):
        # Stop once enough documents have been KEPT (filters below may skip rows).
        if max_docs and len(documents) >= max_docs:
            break

        text = (row.get("text") or "").strip()
        filename = row.get("filename") or f"doc_{i}"

        if len(text) < min_text_length:
            continue
        if source_filter and not filename.startswith(source_filter):
            continue

        documents.append({
            "doc_id": Path(filename).stem,
            "text": text,
            "filename": filename,
        })

    elapsed = time.time() - started
    logger.info(f"Loaded {len(documents)} documents in {elapsed:.1f}s")
    return documents
70
+
71
+
72
def load_raw_to_engine(
    engine,
    max_docs: Optional[int] = 500,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
    build_index: bool = True,
) -> dict:
    """
    Load raw dataset directly into a ContextualSimilarityEngine.

    Args:
        engine: ContextualSimilarityEngine instance (must be initialized).
        max_docs: Limit documents to load.
        min_text_length: Skip short documents.
        source_filter: Filter by filename prefix.
        build_index: Whether to build FAISS index after loading.

    Returns:
        Stats dict with counts and timing.
    """
    started = time.time()
    documents = load_raw_dataset(max_docs, min_text_length, source_filter)

    chunk_total = 0
    skipped = 0
    for doc in documents:
        try:
            chunk_total += len(engine.add_document(doc["doc_id"], doc["text"]))
        except ValueError as exc:
            # Duplicate doc_id (or other add_document validation failure):
            # log and keep going rather than aborting the whole load.
            logger.warning("Skipped document '%s': %s", doc["doc_id"], exc)
            skipped += 1

    if build_index and chunk_total > 0:
        engine.build_index(show_progress=True)

    return {
        "documents_loaded": len(documents) - skipped,
        "documents_skipped": skipped,
        "total_chunks": chunk_total,
        "index_built": build_index and chunk_total > 0,
        "seconds": round(time.time() - started, 2),
    }
116
+
117
+
118
def load_chromadb_embeddings(
    download_dir: str = "./chroma_epstein",
) -> dict:
    """
    Download and load the pre-computed ChromaDB embeddings.

    Args:
        download_dir: Local directory the HuggingFace snapshot is placed in.

    Returns:
        Dict with "chroma_dir", "collection_name", "total_vectors", "seconds",
        plus live "_collection" and "_client" handles (underscore-prefixed:
        they are runtime objects, not serializable stats).

    Raises:
        FileNotFoundError: If no chroma.sqlite3 is found in the snapshot.
        ValueError: If the persisted ChromaDB contains no collections.
    """
    # Local imports keep chromadb/huggingface_hub optional until this
    # loader is actually used.
    import chromadb
    from huggingface_hub import snapshot_download

    t0 = time.time()
    logger.info(f"Downloading {EMBEDDINGS_DATASET} from HuggingFace...")

    # This repo contains ChromaDB persistence files (not standard datasets),
    # so we use snapshot_download instead of load_dataset.
    local_path = snapshot_download(
        repo_id=EMBEDDINGS_DATASET,
        repo_type="dataset",
        local_dir=download_dir,
    )

    # Find the chroma_db directory: the sqlite file may sit either in a
    # "chroma_db" subfolder or at the snapshot root.
    chroma_dir = None
    for candidate in [
        Path(local_path) / "chroma_db",
        Path(local_path),
    ]:
        if (candidate / "chroma.sqlite3").exists():
            chroma_dir = str(candidate)
            break

    if not chroma_dir:
        raise FileNotFoundError(
            f"ChromaDB files not found in {local_path}. "
            f"Expected chroma.sqlite3 in the download."
        )

    # Open ChromaDB
    client = chromadb.PersistentClient(path=chroma_dir)
    collections = client.list_collections()
    if not collections:
        raise ValueError("No collections found in ChromaDB.")

    # NOTE(review): assumes the snapshot holds a single collection — if
    # several exist, the first returned is used arbitrarily; confirm.
    collection = collections[0]
    count = collection.count()
    logger.info(f"ChromaDB collection '{collection.name}': {count} vectors")

    elapsed = time.time() - t0
    return {
        "chroma_dir": chroma_dir,
        "collection_name": collection.name,
        "total_vectors": count,
        "seconds": round(elapsed, 2),
        "_collection": collection,
        "_client": client,
    }
176
+
177
+
178
def import_chromadb_to_engine(
    engine,
    max_chunks: Optional[int] = None,
    batch_size: int = 1000,
) -> dict:
    """
    Import text chunks from the pre-computed ChromaDB collection into the engine.

    Note on re-encoding: although both the ChromaDB collection and the engine
    use all-MiniLM-L6-v2 (384-dim), this routine only reuses the *texts* —
    chunks are regrouped per source document, added via engine.add_document(),
    and re-encoded when engine.build_index() runs. The fetched vectors are
    counted for the stats but are not injected into FAISS directly.

    Args:
        engine: ContextualSimilarityEngine (must be initialized with all-MiniLM-L6-v2).
        max_chunks: Limit vectors to import (None = all).
        batch_size: How many vectors to fetch from ChromaDB at a time.

    Returns:
        Stats dict with vector/document/chunk counts and timing.
    """
    t0 = time.time()
    chroma_data = load_chromadb_embeddings()
    collection = chroma_data["_collection"]
    total = chroma_data["total_vectors"]

    if max_chunks:
        total = min(total, max_chunks)

    # Fetch in batches
    all_texts = []
    all_embeddings = []
    all_sources = []

    offset = 0
    while offset < total:
        limit = min(batch_size, total - offset)
        results = collection.get(
            limit=limit,
            offset=offset,
            include=["embeddings", "documents", "metadatas"],
        )

        if not results["ids"]:
            break

        for i, doc_id in enumerate(results["ids"]):
            text = results["documents"][i] if results["documents"] is not None else ""
            embedding = results["embeddings"][i] if results["embeddings"] is not None else None
            metadata = results["metadatas"][i] if results["metadatas"] is not None else {}
            # Individual metadata entries can themselves be None even when the
            # metadatas list is present — normalize before calling .get().
            metadata = metadata or {}
            source = metadata.get("source", f"chunk_{offset + i}")

            # Keep only rows that have both text and a vector.
            if text and embedding is not None:
                all_texts.append(text)
                all_embeddings.append(embedding)
                all_sources.append(source)

        offset += len(results["ids"])
        logger.info(f"Fetched {offset}/{total} vectors from ChromaDB")

    # Group texts by source document and add to engine
    doc_chunks = {}
    for text, source in zip(all_texts, all_sources):
        stem = Path(source).stem if source else "unknown"
        doc_chunks.setdefault(stem, []).append(text)

    docs_added = 0
    chunks_added = 0
    for doc_id, texts in doc_chunks.items():
        combined = "\n\n".join(texts)
        try:
            chunks = engine.add_document(doc_id, combined)
            chunks_added += len(chunks)
            docs_added += 1
        except ValueError as e:
            # Engine rejected the document (e.g. unchunkable) — skip and continue.
            logger.warning("Skipped ChromaDB document '%s': %s", doc_id, e)

    if chunks_added > 0:
        engine.build_index(show_progress=True)

    elapsed = time.time() - t0
    return {
        "source": "chromadb_embeddings",
        "chromadb_vectors": len(all_embeddings),
        "documents_created": docs_added,
        "chunks_indexed": chunks_added,
        "index_built": chunks_added > 0,
        "seconds": round(elapsed, 2),
    }
267
+
268
+
269
def get_dataset_info() -> dict:
    """Return metadata about available datasets (no download)."""
    raw_info = {
        "dataset_id": RAW_DATASET,
        "url": f"https://huggingface.co/datasets/{RAW_DATASET}",
        "description": "2.1M OCR text documents from U.S. House Oversight Committee Epstein Files release",
        "columns": ["filename", "text"],
        "size_mb": 106,
    }
    embedding_info = {
        "dataset_id": EMBEDDINGS_DATASET,
        "url": f"https://huggingface.co/datasets/{EMBEDDINGS_DATASET}",
        "description": "Pre-computed all-MiniLM-L6-v2 embeddings in ChromaDB format (~100K+ chunks)",
        "model": "all-MiniLM-L6-v2",
        "vector_dim": 384,
    }
    return {
        "raw_texts": raw_info,
        "embeddings": embedding_info,
    }
demo.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Demo: Word2Vec vs Transformer — side by side comparison.
3
+
4
+ Run: python demo.py
5
+ """
6
+
7
+ import json
8
+ from contextual_similarity import ContextualSimilarityEngine
9
+ from word2vec_baseline import Word2VecEngine
10
+ from evaluation import Evaluator, GroundTruthEntry
11
+
12
+ # ------------------------------------------------------------------ #
13
+ # Sample corpus
14
+ # ------------------------------------------------------------------ #
15
+
16
# Toy corpus for the demo. Two documents use a "code language" in which
# "pizza" means "school" (secret_language, misunderstanding); the other two
# use the same words literally (real_pizza, school_board). Keys are doc IDs.
DOCS = {
    "secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.

The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.

Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
    "real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.

Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
    "school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.

The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
    "misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?

She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.

When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
65
+
66
# (text_a, text_b) pairs scored by BOTH engines in demo section 1: the first
# and last pairs probe the corpus's "pizza == school" code language; the
# middle pairs contrast code-language phrasing with literal food phrasing.
COMPARE_PAIRS = [
    ("I love pizza so much", "I love school so much"),
    ("pizza gives me homework", "school gives me homework"),
    ("pizza gives me homework", "fresh mozzarella on pizza"),
    ("The pizza test is hard", "The school exam is difficult"),
    ("too much pepperoni on my pizza", "math class is too hard"),
]
73
+
74
+
75
def main():
    """Run the five-part Word2Vec vs Transformer comparison demo.

    Builds both engines over the shared DOCS corpus, then prints:
    1) pairwise text similarity, 2) Word2Vec word-level neighbors,
    3) semantic search, 4) keyword meaning matching, 5) meaning clustering,
    ending with a qualitative summary. Output goes to stdout only.
    """
    # ================================================================ #
    # Build both engines on the same corpus
    # ================================================================ #
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)

    # Transformer engine
    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")

    # Word2Vec engine
    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")

    # ================================================================ #
    # 1. Text similarity comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("1. TEXT SIMILARITY — same pairs, both models")
    print("=" * 70)
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)

    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        # "Winner" = larger absolute similarity, i.e. the more confident model.
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")

    # ================================================================ #
    # 2. Word-level similarity (Word2Vec only — transformers don't do this)
    # ================================================================ #
    print("\n" + "=" * 70)
    print("2. WORD-LEVEL SIMILARITY (Word2Vec only)")
    print(" Word2Vec gives ONE vector per word — no context awareness")
    print("=" * 70)

    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")

    print(f"\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")

    # ================================================================ #
    # 3. Semantic search comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("3. SEMANTIC SEARCH — 'a place where children learn and take tests'")
    print("=" * 70)

    query = "a place where children learn and take tests"

    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")

    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")

    # ================================================================ #
    # 4. The core test: does "pizza" mean "school" or "food"?
    # ================================================================ #
    print("\n" + "=" * 70)
    print("4. KEYWORD MEANING MATCHING — 'pizza' -> food or school?")
    print(" Transformer uses full passage context. Word2Vec averages word vectors.")
    print("=" * 70)

    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]

    print("\n Transformer (match_keyword_to_meaning):")
    matches = transformer.match_keyword_to_meaning("pizza", candidates)
    for m in matches:
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")

    print("\n Word2Vec (sentence-level similarity to candidates):")
    # Replicate the same logic with Word2Vec: find each doc's first sentence
    # mentioning "pizza" (with at least 5 words) and score it against the
    # two candidate meanings.
    import re
    for doc_id, text in DOCS.items():
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
        for sent in sents:
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break  # one per doc for brevity

    # ================================================================ #
    # 5. Clustering comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("5. KEYWORD CLUSTERING — can the model separate meanings of 'pizza'?")
    print("=" * 70)

    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        docs = set(ctx.chunk.doc_id for ctx in c["contexts"])
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")

    print(f"\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(f" 'pizza' has exactly ONE embedding regardless of context")

    # ================================================================ #
    # Summary
    # ================================================================ #
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("""
Word2Vec:
+ Fast to train on small corpus
+ Shows which words co-occur (word-level neighbors)
- ONE vector per word — "pizza" is always "pizza"
- Cannot distinguish "pizza = food" from "pizza = school"
- Sentence similarity is just averaged word vectors (lossy)

Transformer (SentenceTransformers):
+ Full sentence/passage context — same word gets different embeddings
+ Can cluster "pizza" into food vs school meanings
+ Pretrained on massive data — understands language out of the box
+ FAISS enables fast search over large corpora
- Larger model (~80MB vs ~1MB for Word2Vec)
- Slower inference (still <100ms per query)
""")


if __name__ == "__main__":
    main()
docker-compose.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-service stack: builds the local Dockerfile and publishes the API on
# host port 8000. HOST/PORT are consumed by the containerized server.
services:
  app:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Persist HuggingFace model cache between restarts
      - hf-cache:/data/huggingface
      # Persist engine state and trained models
      - engine-state:/data/engine_state
      # Bind mount: trained-model artifacts are shared with the host checkout
      - ./trained_model:/data/trained_model
    environment:
      # Bind all interfaces inside the container so the published port works
      - HOST=0.0.0.0
      - PORT=8000

# Named volumes live in Docker-managed storage and survive container recreation
volumes:
  hf-cache:
  engine-state:
evaluation.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation Pipeline for Contextual Similarity Engine
3
+
4
+ Provides metrics and benchmarks to assess the quality of contextual
5
+ keyword matching:
6
+ - Cosine similarity distributions
7
+ - Precision@K and Recall@K for retrieval
8
+ - Normalized Mutual Information (NMI) for clustering quality
9
+ - Mean Reciprocal Rank (MRR) for ranking quality
10
+ - Keyword disambiguation accuracy against ground truth
11
+ - Full evaluation reports with summary statistics
12
+ """
13
+
14
+ import json
15
+ import logging
16
+ import time
17
+ from dataclasses import dataclass, field, asdict
18
+ from pathlib import Path
19
+ from typing import Optional
20
+
21
+ import numpy as np
22
+ from sklearn.metrics import (
23
+ normalized_mutual_info_score,
24
+ adjusted_rand_score,
25
+ precision_score,
26
+ recall_score,
27
+ f1_score,
28
+ confusion_matrix,
29
+ )
30
+
31
+ from contextual_similarity import ContextualSimilarityEngine, KeywordAnalysis
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # ------------------------------------------------------------------ #
37
+ # Data structures
38
+ # ------------------------------------------------------------------ #
39
+
40
@dataclass
class GroundTruthEntry:
    """A single labeled keyword occurrence for evaluation."""
    keyword: str       # The keyword whose meaning is being evaluated
    text: str          # The passage/sentence containing the keyword
    true_meaning: str  # The actual intended meaning label
46
+
47
+
48
@dataclass
class RetrievalMetrics:
    """Metrics for a single retrieval query.

    The *_at_k dicts are keyed by cutoff K (e.g. 1, 3, 5, 10).
    """
    query: str  # The evaluated query text
    precision_at_k: dict[int, float] = field(default_factory=dict)  # k -> P@k
    recall_at_k: dict[int, float] = field(default_factory=dict)     # k -> R@k
    mrr: float = 0.0  # Mean Reciprocal Rank (0.0 when nothing relevant retrieved)
    ndcg_at_k: dict[int, float] = field(default_factory=dict)       # k -> NDCG@k
    avg_similarity: float = 0.0  # Mean similarity score over retrieved results
    top_score: float = 0.0       # Similarity score of the rank-1 result
58
+
59
+
60
@dataclass
class ClusteringMetrics:
    """Metrics for clustering quality against ground truth."""
    keyword: str          # Keyword whose occurrences were clustered
    nmi: float = 0.0      # Normalized Mutual Information
    ari: float = 0.0      # Adjusted Rand Index
    num_predicted_clusters: int = 0  # Clusters the engine produced
    num_true_clusters: int = 0       # Distinct ground-truth meaning labels
    cluster_sizes: list[int] = field(default_factory=list)  # Size of each predicted cluster
69
+
70
+
71
@dataclass
class DisambiguationMetrics:
    """Metrics for keyword meaning disambiguation.

    Per-meaning dicts are keyed by the meaning label string.
    """
    keyword: str
    accuracy: float = 0.0     # Fraction of entries with predicted == true meaning
    weighted_f1: float = 0.0  # Support-weighted F1 across meaning labels
    per_meaning_precision: dict[str, float] = field(default_factory=dict)
    per_meaning_recall: dict[str, float] = field(default_factory=dict)
    per_meaning_f1: dict[str, float] = field(default_factory=dict)
    confusion: Optional[list] = None  # confusion matrix as nested list
    total_samples: int = 0  # Number of ground-truth entries evaluated
82
+
83
+
84
@dataclass
class EvaluationReport:
    """Complete evaluation report.

    Aggregates the per-query / per-keyword metrics produced by Evaluator and
    offers summary(), to_json(), and save() for reporting.
    """
    timestamp: str = ""
    model_name: str = ""
    corpus_stats: dict = field(default_factory=dict)
    retrieval_metrics: list[RetrievalMetrics] = field(default_factory=list)
    clustering_metrics: list[ClusteringMetrics] = field(default_factory=list)
    disambiguation_metrics: list[DisambiguationMetrics] = field(default_factory=list)
    similarity_distribution: dict = field(default_factory=dict)
    timing: dict = field(default_factory=dict)

    def summary(self) -> dict:
        """Return a concise summary of the evaluation.

        Each section (retrieval / clustering / disambiguation) is included
        only when its metrics list is non-empty; values are numpy means
        rounded to 4 decimals.
        """
        summary = {
            "model": self.model_name,
            "corpus": self.corpus_stats,
            "timing": self.timing,
        }

        if self.retrieval_metrics:
            avg_mrr = float(np.mean([m.mrr for m in self.retrieval_metrics]))
            # .get(k, 0) so queries never evaluated at that cutoff count as 0.
            avg_p5 = float(np.mean([m.precision_at_k.get(5, 0) for m in self.retrieval_metrics]))
            avg_p10 = float(np.mean([m.precision_at_k.get(10, 0) for m in self.retrieval_metrics]))
            summary["retrieval"] = {
                "mean_mrr": round(avg_mrr, 4),
                "mean_precision_at_5": round(avg_p5, 4),
                "mean_precision_at_10": round(avg_p10, 4),
                "num_queries": len(self.retrieval_metrics),
            }

        if self.clustering_metrics:
            avg_nmi = float(np.mean([m.nmi for m in self.clustering_metrics]))
            avg_ari = float(np.mean([m.ari for m in self.clustering_metrics]))
            summary["clustering"] = {
                "mean_nmi": round(avg_nmi, 4),
                "mean_ari": round(avg_ari, 4),
                "num_keywords": len(self.clustering_metrics),
            }

        if self.disambiguation_metrics:
            avg_acc = float(np.mean([m.accuracy for m in self.disambiguation_metrics]))
            avg_f1 = float(np.mean([m.weighted_f1 for m in self.disambiguation_metrics]))
            summary["disambiguation"] = {
                "mean_accuracy": round(avg_acc, 4),
                "mean_weighted_f1": round(avg_f1, 4),
                "num_keywords": len(self.disambiguation_metrics),
            }

        if self.similarity_distribution:
            summary["similarity_distribution"] = self.similarity_distribution

        return summary

    def to_json(self, indent: int = 2) -> str:
        """Serialize the full report to JSON.

        default=str lets non-JSON-native values (e.g. numpy scalars) degrade
        to their string form instead of raising.
        """
        return json.dumps(asdict(self), indent=indent, default=str)

    def save(self, path: str) -> None:
        """Save the report to a JSON file at `path`."""
        Path(path).write_text(self.to_json())
        logger.info(f"Evaluation report saved to {path}")
146
+
147
+
148
+ # ------------------------------------------------------------------ #
149
+ # Evaluator
150
+ # ------------------------------------------------------------------ #
151
+
152
+ class Evaluator:
153
+ """
154
+ Evaluation pipeline for the ContextualSimilarityEngine.
155
+
156
+ Usage:
157
+ engine = ContextualSimilarityEngine()
158
+ engine.add_document("doc1", text)
159
+ engine.build_index()
160
+
161
+ evaluator = Evaluator(engine)
162
+
163
+ # Evaluate retrieval quality
164
+ evaluator.evaluate_retrieval(queries_with_relevance)
165
+
166
+ # Evaluate keyword disambiguation
167
+ evaluator.evaluate_disambiguation(ground_truth, candidate_meanings)
168
+
169
+ # Evaluate clustering
170
+ evaluator.evaluate_clustering(ground_truth)
171
+
172
+ # Get full report
173
+ report = evaluator.get_report()
174
+ """
175
+
176
    def __init__(self, engine: ContextualSimilarityEngine):
        """Bind the evaluator to an engine and start a fresh report.

        The engine is expected to already hold documents and a built index
        (corpus stats are snapshotted here via engine.get_stats()).
        """
        self.engine = engine
        # NOTE(review): reads the engine's private _model_name attribute —
        # prefer a public accessor if one exists; confirm on the engine API.
        self._report = EvaluationReport(
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
            model_name=engine._model_name,
            corpus_stats=engine.get_stats(),
        )
+
184
+ # ------------------------------------------------------------------ #
185
+ # Retrieval evaluation
186
+ # ------------------------------------------------------------------ #
187
+
188
    def evaluate_retrieval(
        self,
        queries: list[dict],
        k_values: Optional[list[int]] = None,
    ) -> list[RetrievalMetrics]:
        """
        Evaluate retrieval quality given labeled queries.

        Args:
            queries: List of dicts with keys:
                - "query": str, the query text
                - "relevant_doc_ids": list[str], doc IDs that are relevant
                OR
                - "relevant_texts": list[str], text snippets considered relevant
            k_values: List of K values for P@K, R@K, NDCG@K.
                Defaults to [1, 3, 5, 10].

        Returns:
            List of RetrievalMetrics, one per query. Results and elapsed
            time are also stored on the evaluator's report.
        """
        if k_values is None:
            k_values = [1, 3, 5, 10]

        t0 = time.time()
        all_metrics = []

        for q in queries:
            query_text = q["query"]
            # Retrieve once at the deepest cutoff; shallower K reuse the prefix.
            max_k = max(k_values)
            results = self.engine.query(query_text, top_k=max_k)

            # Determine relevance for each result
            relevant_doc_ids = set(q.get("relevant_doc_ids", []))
            relevant_texts = set(q.get("relevant_texts", []))

            def is_relevant(result):
                # Doc-ID membership first; otherwise case-insensitive
                # substring containment of any labeled snippet.
                if relevant_doc_ids and result.chunk.doc_id in relevant_doc_ids:
                    return True
                if relevant_texts:
                    return any(rt.lower() in result.chunk.text.lower() for rt in relevant_texts)
                return False

            relevance = [is_relevant(r) for r in results]
            scores = [r.score for r in results]

            metrics = RetrievalMetrics(query=query_text)

            # P@K and R@K
            # NOTE(review): total_relevant counts relevant hits *within the
            # retrieved top-max_k only*, so recall is relative to the
            # retrieved set (recall@max_k is 1.0 whenever anything relevant
            # was found), not recall against the corpus — confirm intended.
            total_relevant = sum(relevance)
            for k in k_values:
                top_k_rel = relevance[:k]
                metrics.precision_at_k[k] = sum(top_k_rel) / k if k > 0 else 0
                metrics.recall_at_k[k] = (
                    sum(top_k_rel) / total_relevant if total_relevant > 0 else 0
                )
                metrics.ndcg_at_k[k] = self._compute_ndcg(relevance[:k], k)

            # MRR: reciprocal rank of the first relevant hit (0.0 if none).
            for i, rel in enumerate(relevance):
                if rel:
                    metrics.mrr = 1.0 / (i + 1)
                    break

            metrics.avg_similarity = float(np.mean(scores)) if scores else 0.0
            metrics.top_score = float(scores[0]) if scores else 0.0

            all_metrics.append(metrics)

        elapsed = time.time() - t0
        self._report.retrieval_metrics = all_metrics
        self._report.timing["retrieval_eval_seconds"] = round(elapsed, 3)
        return all_metrics
259
+
260
+ @staticmethod
261
+ def _compute_ndcg(relevance: list[bool], k: int) -> float:
262
+ """Compute NDCG@K for binary relevance."""
263
+ dcg = sum(
264
+ (1 if rel else 0) / np.log2(i + 2)
265
+ for i, rel in enumerate(relevance[:k])
266
+ )
267
+ # Ideal: all relevant items first
268
+ ideal = sorted(relevance[:k], reverse=True)
269
+ idcg = sum(
270
+ (1 if rel else 0) / np.log2(i + 2)
271
+ for i, rel in enumerate(ideal)
272
+ )
273
+ return dcg / idcg if idcg > 0 else 0.0
274
+
275
+ # ------------------------------------------------------------------ #
276
+ # Clustering evaluation
277
+ # ------------------------------------------------------------------ #
278
+
279
    def evaluate_clustering(
        self,
        ground_truth: list[GroundTruthEntry],
        cluster_threshold: float = 0.35,
    ) -> list[ClusteringMetrics]:
        """
        Evaluate clustering quality by comparing engine's auto-clusters
        against ground truth meaning labels.

        NMI and ARI are label-permutation invariant, so the arbitrary
        numbering of clusters on either side does not matter.

        Args:
            ground_truth: Labeled entries with keyword, text, and true_meaning.
            cluster_threshold: Threshold for agglomerative clustering.

        Returns:
            List of ClusteringMetrics, one per keyword. Also stored on the
            evaluator's report together with elapsed time.
        """
        t0 = time.time()

        # Group ground truth by keyword
        by_keyword: dict[str, list[GroundTruthEntry]] = {}
        for entry in ground_truth:
            by_keyword.setdefault(entry.keyword, []).append(entry)

        all_metrics = []
        for keyword, entries in by_keyword.items():
            analysis = self.engine.analyze_keyword(
                keyword, cluster_threshold=cluster_threshold
            )

            # No clusters produced for this keyword: record zeroed metrics.
            if not analysis.meaning_clusters:
                all_metrics.append(ClusteringMetrics(keyword=keyword))
                continue

            # Map ground truth entries to predicted clusters
            true_labels = []
            pred_labels = []
            meaning_to_id = {}

            for entry in entries:
                # Assign numeric ID to each true meaning
                if entry.true_meaning not in meaning_to_id:
                    meaning_to_id[entry.true_meaning] = len(meaning_to_id)
                true_labels.append(meaning_to_id[entry.true_meaning])

                # Find which cluster this entry's text belongs to:
                # nearest-neighbor assignment over every clustered chunk.
                best_cluster = -1
                best_sim = -1
                entry_vec = self.engine.model.encode(
                    [entry.text], normalize_embeddings=True, convert_to_numpy=True
                )
                # NOTE(review): list.index() below is a linear scan per
                # context (quadratic overall on large corpora) — a
                # chunk->index map would be cheaper. The dot product is
                # treated as cosine similarity, which assumes
                # engine.embeddings are L2-normalized; confirm in the engine.
                for cluster in analysis.meaning_clusters:
                    for ctx in cluster["contexts"]:
                        idx = self.engine.chunks.index(ctx.chunk)
                        sim = float(np.dot(entry_vec[0], self.engine.embeddings[idx]))
                        if sim > best_sim:
                            best_sim = sim
                            best_cluster = cluster["cluster_id"]
                pred_labels.append(best_cluster)

            metrics = ClusteringMetrics(
                keyword=keyword,
                nmi=normalized_mutual_info_score(true_labels, pred_labels),
                ari=adjusted_rand_score(true_labels, pred_labels),
                num_predicted_clusters=len(analysis.meaning_clusters),
                num_true_clusters=len(meaning_to_id),
                cluster_sizes=[c["size"] for c in analysis.meaning_clusters],
            )
            all_metrics.append(metrics)

        elapsed = time.time() - t0
        self._report.clustering_metrics = all_metrics
        self._report.timing["clustering_eval_seconds"] = round(elapsed, 3)
        return all_metrics
352
+
353
+ # ------------------------------------------------------------------ #
354
+ # Disambiguation evaluation
355
+ # ------------------------------------------------------------------ #
356
+
357
+ def evaluate_disambiguation(
358
+ self,
359
+ ground_truth: list[GroundTruthEntry],
360
+ candidate_meanings: dict[str, list[str]],
361
+ ) -> list[DisambiguationMetrics]:
362
+ """
363
+ Evaluate keyword meaning disambiguation accuracy.
364
+
365
+ For each ground truth entry, uses match_keyword_to_meaning() and compares
366
+ the predicted best match against the true label.
367
+
368
+ Args:
369
+ ground_truth: Labeled entries with keyword, text, and true_meaning.
370
+ candidate_meanings: Dict mapping keyword -> list of candidate meaning strings.
371
+ Each candidate should be a descriptive phrase, e.g. {"pizza": ["food", "school"]}.
372
+
373
+ Returns:
374
+ List of DisambiguationMetrics, one per keyword.
375
+ """
376
+ t0 = time.time()
377
+
378
+ by_keyword: dict[str, list[GroundTruthEntry]] = {}
379
+ for entry in ground_truth:
380
+ by_keyword.setdefault(entry.keyword, []).append(entry)
381
+
382
+ all_metrics = []
383
+ for keyword, entries in by_keyword.items():
384
+ candidates = candidate_meanings.get(keyword, [])
385
+ if not candidates:
386
+ logger.warning(f"No candidate meanings for '{keyword}', skipping.")
387
+ continue
388
+
389
+ true_labels = []
390
+ pred_labels = []
391
+
392
+ for entry in entries:
393
+ # Encode the entry text and score against each candidate
394
+ entry_vec = self.engine.model.encode(
395
+ [entry.text], normalize_embeddings=True, convert_to_tensor=True
396
+ )
397
+ cand_vecs = self.engine.model.encode(
398
+ candidates, normalize_embeddings=True, convert_to_tensor=True
399
+ )
400
+ from sentence_transformers import util as st_util
401
+ scores = st_util.pytorch_cos_sim(entry_vec, cand_vecs)[0]
402
+ best_idx = int(scores.argmax())
403
+ predicted = candidates[best_idx]
404
+
405
+ true_labels.append(entry.true_meaning)
406
+ pred_labels.append(predicted)
407
+
408
+ # Compute metrics
409
+ unique_labels = sorted(set(true_labels + pred_labels))
410
+ accuracy = sum(t == p for t, p in zip(true_labels, pred_labels)) / len(true_labels)
411
+
412
+ # Per-meaning precision, recall, F1
413
+ per_meaning_p = {}
414
+ per_meaning_r = {}
415
+ per_meaning_f = {}
416
+ for label in unique_labels:
417
+ t_binary = [1 if t == label else 0 for t in true_labels]
418
+ p_binary = [1 if p == label else 0 for p in pred_labels]
419
+ p_val = precision_score(t_binary, p_binary, zero_division=0)
420
+ r_val = recall_score(t_binary, p_binary, zero_division=0)
421
+ f_val = f1_score(t_binary, p_binary, zero_division=0)
422
+ per_meaning_p[label] = round(p_val, 4)
423
+ per_meaning_r[label] = round(r_val, 4)
424
+ per_meaning_f[label] = round(f_val, 4)
425
+
426
+ weighted_f = f1_score(
427
+ true_labels, pred_labels, average="weighted", zero_division=0
428
+ )
429
+
430
+ cm = confusion_matrix(true_labels, pred_labels, labels=unique_labels)
431
+
432
+ metrics = DisambiguationMetrics(
433
+ keyword=keyword,
434
+ accuracy=round(accuracy, 4),
435
+ weighted_f1=round(weighted_f, 4),
436
+ per_meaning_precision=per_meaning_p,
437
+ per_meaning_recall=per_meaning_r,
438
+ per_meaning_f1=per_meaning_f,
439
+ confusion=cm.tolist(),
440
+ total_samples=len(entries),
441
+ )
442
+ all_metrics.append(metrics)
443
+
444
+ elapsed = time.time() - t0
445
+ self._report.disambiguation_metrics = all_metrics
446
+ self._report.timing["disambiguation_eval_seconds"] = round(elapsed, 3)
447
+ return all_metrics
448
+
449
+ # ------------------------------------------------------------------ #
450
+ # Similarity distribution analysis
451
+ # ------------------------------------------------------------------ #
452
+
453
+ def analyze_similarity_distribution(
454
+ self, sample_size: int = 1000, seed: int = 42
455
+ ) -> dict:
456
+ """
457
+ Analyze the distribution of pairwise similarities in the corpus.
458
+ Useful for calibrating thresholds and understanding embedding space.
459
+
460
+ Returns:
461
+ Dict with mean, std, percentiles, and histogram data.
462
+ """
463
+ self.engine._ensure_index()
464
+ n = len(self.engine.chunks)
465
+ rng = np.random.RandomState(seed)
466
+
467
+ # Sample random pairs
468
+ actual_sample = min(sample_size, n * (n - 1) // 2)
469
+ pairs_i = rng.randint(0, n, size=actual_sample)
470
+ pairs_j = rng.randint(0, n, size=actual_sample)
471
+ # Avoid self-pairs
472
+ mask = pairs_i != pairs_j
473
+ pairs_i, pairs_j = pairs_i[mask], pairs_j[mask]
474
+
475
+ sims = np.sum(
476
+ self.engine.embeddings[pairs_i] * self.engine.embeddings[pairs_j], axis=1
477
+ )
478
+
479
+ percentiles = {
480
+ str(p): round(float(np.percentile(sims, p)), 4)
481
+ for p in [5, 10, 25, 50, 75, 90, 95]
482
+ }
483
+
484
+ # Histogram
485
+ hist, bin_edges = np.histogram(sims, bins=20, range=(-1, 1))
486
+ histogram = [
487
+ {"bin_start": round(float(bin_edges[i]), 3), "bin_end": round(float(bin_edges[i + 1]), 3), "count": int(hist[i])}
488
+ for i in range(len(hist))
489
+ ]
490
+
491
+ dist_info = {
492
+ "sample_size": int(len(sims)),
493
+ "mean": round(float(np.mean(sims)), 4),
494
+ "std": round(float(np.std(sims)), 4),
495
+ "min": round(float(np.min(sims)), 4),
496
+ "max": round(float(np.max(sims)), 4),
497
+ "percentiles": percentiles,
498
+ "histogram": histogram,
499
+ }
500
+
501
+ self._report.similarity_distribution = dist_info
502
+ return dist_info
503
+
504
+ # ------------------------------------------------------------------ #
505
+ # Full evaluation
506
+ # ------------------------------------------------------------------ #
507
+
508
+ def run_full_evaluation(
509
+ self,
510
+ ground_truth: Optional[list[GroundTruthEntry]] = None,
511
+ candidate_meanings: Optional[dict[str, list[str]]] = None,
512
+ retrieval_queries: Optional[list[dict]] = None,
513
+ cluster_threshold: float = 0.35,
514
+ ) -> EvaluationReport:
515
+ """
516
+ Run the complete evaluation pipeline.
517
+
518
+ Args:
519
+ ground_truth: Labeled data for clustering and disambiguation eval.
520
+ candidate_meanings: Keyword -> candidate meanings for disambiguation.
521
+ retrieval_queries: Labeled queries for retrieval eval.
522
+ cluster_threshold: Clustering distance threshold.
523
+
524
+ Returns:
525
+ Full EvaluationReport.
526
+ """
527
+ logger.info("Running full evaluation pipeline...")
528
+ t0 = time.time()
529
+
530
+ # Always compute similarity distribution
531
+ self.analyze_similarity_distribution()
532
+
533
+ if retrieval_queries:
534
+ self.evaluate_retrieval(retrieval_queries)
535
+
536
+ if ground_truth:
537
+ self.evaluate_clustering(ground_truth, cluster_threshold)
538
+ if candidate_meanings:
539
+ self.evaluate_disambiguation(ground_truth, candidate_meanings)
540
+
541
+ self._report.timing["total_eval_seconds"] = round(time.time() - t0, 3)
542
+ logger.info("Evaluation complete.")
543
+ return self._report
544
+
545
+ def get_report(self) -> EvaluationReport:
546
+ """Return the current evaluation report."""
547
+ return self._report
frontend/.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
frontend/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # React + Vite
2
+
3
+ This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4
+
5
+ Currently, two official plugins are available:
6
+
7
+ - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh
8
+ - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
9
+
10
+ ## React Compiler
11
+
12
+ The React Compiler is not enabled on this template because of its impact on dev & build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
13
+
14
+ ## Expanding the ESLint configuration
15
+
16
+ If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
frontend/eslint.config.js ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// ESLint flat config (ESLint 9+) for the frontend dev tooling.
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import { defineConfig, globalIgnores } from 'eslint/config'

export default defineConfig([
  // Never lint build output.
  globalIgnores(['dist']),
  {
    // NOTE(review): only .js/.jsx files are matched here, but the app sources
    // are .ts/.tsx (src/*.tsx), so `eslint .` will skip them unless
    // typescript-eslint coverage is added — confirm whether this is intentional.
    files: ['**/*.{js,jsx}'],
    extends: [
      js.configs.recommended,
      reactHooks.configs.flat.recommended,
      reactRefresh.configs.vite,
    ],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
      parserOptions: {
        ecmaVersion: 'latest',
        ecmaFeatures: { jsx: true },
        sourceType: 'module',
      },
    },
    rules: {
      // Allow intentionally-unused uppercase/underscore-prefixed bindings.
      'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
    },
  },
])
frontend/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Contextual Similarity Engine</title>
7
+ </head>
8
+ <body>
9
+ <div id="root"></div>
10
+ <script type="module" src="/src/main.tsx"></script>
11
+ </body>
12
+ </html>
frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
frontend/package.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "contextual-similarity-ui",
3
+ "private": true,
4
+ "version": "1.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "axios": "^1.13.6",
14
+ "react": "^19.2.4",
15
+ "react-dom": "^19.2.4",
16
+ "recharts": "^3.8.0"
17
+ },
18
+ "devDependencies": {
19
+ "@eslint/js": "^9.39.4",
20
+ "@types/react": "^19.2.14",
21
+ "@types/react-dom": "^19.2.3",
22
+ "@vitejs/plugin-react": "^5.1.4",
23
+ "eslint": "^9.39.4",
24
+ "eslint-plugin-react-hooks": "^7.0.1",
25
+ "eslint-plugin-react-refresh": "^0.5.2",
26
+ "globals": "^17.4.0",
27
+ "typescript": "~5.9.3",
28
+ "vite": "^7.3.1"
29
+ }
30
+ }
frontend/public/vite.svg ADDED
frontend/src/App.tsx ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, Fragment } from "react";
2
+ import type { CorpusStats } from "./types";
3
+ import { api, checkConnection } from "./api";
4
+ import TrainingPanel from "./components/TrainingPanel";
5
+ import EngineSetup from "./components/EngineSetup";
6
+ import SemanticSearch from "./components/SemanticSearch";
7
+ import TextCompare from "./components/TextCompare";
8
+ import KeywordAnalysis from "./components/KeywordAnalysis";
9
+ import KeywordMatcher from "./components/KeywordMatcher";
10
+ import BatchAnalysis from "./components/BatchAnalysis";
11
+ import SimilarWords from "./components/SimilarWords";
12
+ import ContextAnalysis from "./components/ContextAnalysis";
13
+ import EvaluationDashboard from "./components/EvaluationDashboard";
14
+ import Word2VecPanel from "./components/Word2VecPanel";
15
+ import DatasetPanel from "./components/DatasetPanel";
16
+ import "./styles.css";
17
+
18
// Top-level navigation groups shown in the stepper.
type NavGroup = "data" | "training" | "analysis" | "evaluation";
// Sub-tabs within the "training" group.
type TrainingTab = "model" | "w2v";
// Sub-tabs within the "analysis" group.
type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";

// Stepper entries; `needsIndex` marks steps that are disabled until the
// corpus index has been built.
const STEPS: { id: NavGroup; label: string; needsIndex?: boolean }[] = [
  { id: "data", label: "Data & Setup" },
  { id: "training", label: "Training" },
  { id: "analysis", label: "Analysis", needsIndex: true },
  { id: "evaluation", label: "Evaluation", needsIndex: true },
];

const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
  { id: "model", label: "Fine-tune Model" },
  { id: "w2v", label: "Word2Vec Baseline" },
];

const ANALYSIS_TABS: { id: AnalysisTab; label: string }[] = [
  { id: "context", label: "Context" },
  { id: "words", label: "Similar Words" },
  { id: "search", label: "Search" },
  { id: "compare", label: "Compare" },
  { id: "keyword", label: "Keywords" },
  { id: "match", label: "Matcher" },
  { id: "batch", label: "Batch" },
];
43
+
44
/**
 * Application shell: header with corpus stats, a stepper used as the main
 * navigation, per-group sub-tabs, and the panel for the selected view.
 * Steps marked `needsIndex` stay disabled until stats report a built index.
 */
export default function App() {
  const [group, setGroup] = useState<NavGroup>("data");
  const [trainingTab, setTrainingTab] = useState<TrainingTab>("model");
  const [analysisTab, setAnalysisTab] = useState<AnalysisTab>("context");
  const [stats, setStats] = useState<CorpusStats | null>(null);
  const [showManualSetup, setShowManualSetup] = useState(false);
  const [serverError, setServerError] = useState<string | null>(null);
  // Gate for the index-dependent steps (analysis/evaluation).
  const ready = stats !== null && stats.index_built;

  useEffect(() => {
    checkConnection().then((err) => {
      setServerError(err);
      // If server is up, try to fetch stats (engine may have been auto-restored)
      if (!err) {
        api.getStats().then(setStats).catch(() => {});
      }
    });
    // Re-probe connectivity every 15s. NOTE(review): stats are only fetched
    // on mount, not on recovery — confirm whether that is intentional.
    const interval = setInterval(() => {
      checkConnection().then(setServerError);
    }, 15000);
    return () => clearInterval(interval);
  }, []);

  // Ignore clicks on steps that require an index before one exists.
  function handleStepClick(id: NavGroup, needsIndex?: boolean) {
    if (needsIndex && !ready) return;
    setGroup(id);
  }

  return (
    <div className="app">
      <header className="app-header">
        <h1>Contextual Similarity Engine</h1>
        {stats && (
          <div className="header-stats">
            <span className="badge">{stats.model_name}</span>
            <span className="badge">{stats.total_documents} docs</span>
            <span className="badge">{stats.total_chunks} chunks</span>
            <span className={`badge ${stats.index_built ? "badge-ok" : "badge-warn"}`}>
              {stats.index_built ? "Index ready" : "Index not built"}
            </span>
          </div>
        )}
      </header>

      {serverError && (
        <div className="server-error-banner">
          <strong>Server unavailable:</strong> {serverError}
        </div>
      )}

      {/* Progress Stepper (serves as main navigation) */}
      <nav className="stepper">
        {STEPS.map((step, i) => {
          const disabled = step.needsIndex && !ready;
          const active = group === step.id;
          // Only the "data" step renders a checkmark once the index exists.
          const done = step.id === "data" && ready;
          return (
            <Fragment key={step.id}>
              {i > 0 && (
                <div className={`stepper-line ${!disabled ? "stepper-line-active" : ""}`} />
              )}
              <div className="stepper-item">
                <button
                  className={`stepper-circle ${active ? "stepper-active" : ""} ${done && !active ? "stepper-done" : ""}`}
                  onClick={() => handleStepClick(step.id, step.needsIndex)}
                  disabled={disabled}
                >
                  {done && !active ? "\u2713" : i + 1}
                </button>
                <span className={`stepper-label ${active ? "stepper-label-active" : ""}`}>
                  {step.label}
                </span>
              </div>
            </Fragment>
          );
        })}
      </nav>

      {/* Sub-tabs for groups with multiple views */}
      {group === "training" && (
        <nav className="subtabs">
          {TRAINING_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${trainingTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setTrainingTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {group === "analysis" && (
        <nav className="subtabs">
          {ANALYSIS_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${analysisTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setAnalysisTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {/* Content */}
      <main className="content">
        {group === "data" && (
          <>
            <DatasetPanel onStatsUpdate={setStats} />
            <button
              className="collapsible-toggle"
              onClick={() => setShowManualSetup(!showManualSetup)}
            >
              <span className="collapsible-arrow">{showManualSetup ? "\u25be" : "\u25b8"}</span>
              Or add documents manually
            </button>
            {showManualSetup && <EngineSetup onStatsUpdate={setStats} />}
          </>
        )}

        {group === "training" && trainingTab === "model" && <TrainingPanel />}
        {group === "training" && trainingTab === "w2v" && <Word2VecPanel />}

        {group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
        {group === "analysis" && analysisTab === "words" && <SimilarWords />}
        {group === "analysis" && analysisTab === "search" && <SemanticSearch />}
        {group === "analysis" && analysisTab === "compare" && <TextCompare />}
        {group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
        {group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
        {group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}

        {group === "evaluation" && <EvaluationDashboard />}
      </main>
    </div>
  );
}
frontend/src/api.ts ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import axios from "axios";
2
+ import type {
3
+ InitRequest, InitResponse, DocumentRequest, AddDocResponse, BuildIndexResponse,
4
+ QueryRequest, QueryResponse, CompareRequest, CompareResponse,
5
+ KeywordAnalysisRequest, KeywordAnalysisResponse,
6
+ KeywordMatchRequest, MatchResponse, BatchAnalysisRequest,
7
+ CorpusStats, SimilarityDistribution, DisambiguationMetric, RetrievalMetric,
8
+ TrainResponse, TrainEvalResponse,
9
+ W2VInitResponse, W2VQueryResult, W2VSimilarWord,
10
+ DatasetInfo, DatasetLoadRequest, DatasetLoadResponse, DatasetPreviewResponse,
11
+ ContextAnalysisResponse,
12
+ } from "./types";
13
+
14
+ const client = axios.create({ baseURL: "/api" });
15
+ const long = { timeout: 600000 };
16
+
17
+ /** Extract a human-readable error message from an Axios error. */
18
+ export function getErrorMessage(err: unknown): string {
19
+ if (axios.isAxiosError(err)) {
20
+ if (err.code === "ECONNABORTED") return "Request timed out. The server may be busy.";
21
+ if (!err.response) return "Cannot connect to server. Is it running? (uv run python server.py)";
22
+ const detail = err.response.data?.detail;
23
+ if (typeof detail === "string") return detail;
24
+ if (typeof err.response.data === "string") return err.response.data;
25
+ return `Server error (${err.response.status})`;
26
+ }
27
+ if (err instanceof Error) return err.message;
28
+ return "An unexpected error occurred.";
29
+ }
30
+
31
+ /** Check if the backend is reachable. Returns null on success or an error message. */
32
+ export async function checkConnection(): Promise<string | null> {
33
+ try {
34
+ await client.get("/stats", { timeout: 5000 });
35
+ return null;
36
+ } catch (err) {
37
+ if (axios.isAxiosError(err) && err.response?.status === 400) {
38
+ // 400 = "Engine not initialized" — server is up, just no engine yet
39
+ return null;
40
+ }
41
+ return getErrorMessage(err);
42
+ }
43
+ }
44
+
45
+ /** Shared shape for all training requests (matches server TrainRequest). */
46
+ interface TrainRequestData {
47
+ corpus_texts: string[];
48
+ base_model: string;
49
+ output_path: string;
50
+ epochs: number;
51
+ batch_size: number;
52
+ }
53
+
54
+ export const api = {
55
+ // ---- Training ----
56
+ trainUnsupervised: (data: TrainRequestData) =>
57
+ client.post<TrainResponse>("/train/unsupervised", data, long).then(r => r.data),
58
+
59
+ trainContrastive: (data: TrainRequestData) =>
60
+ client.post<TrainResponse>("/train/contrastive", data, long).then(r => r.data),
61
+
62
+ trainKeywords: (data: TrainRequestData & { keyword_meanings: Record<string, string> }) =>
63
+ client.post<TrainResponse>("/train/keywords", data, long).then(r => r.data),
64
+
65
+ trainEvaluate: (data: { test_pairs: { text_a: string; text_b: string; expected: number }[]; trained_model_path: string; base_model: string; corpus_texts: string[] }) =>
66
+ client.post<TrainEvalResponse>("/train/evaluate", data).then(r => r.data),
67
+
68
+ // ---- Engine ----
69
+ init: (data: InitRequest) =>
70
+ client.post<InitResponse>("/init", data).then(r => r.data),
71
+
72
+ addDocument: (data: DocumentRequest) =>
73
+ client.post<AddDocResponse>("/documents", data).then(r => r.data),
74
+
75
+ buildIndex: () =>
76
+ client.post<BuildIndexResponse>("/index/build").then(r => r.data),
77
+
78
+ query: (data: QueryRequest) =>
79
+ client.post<QueryResponse>("/query", data).then(r => r.data),
80
+
81
+ compare: (data: CompareRequest) =>
82
+ client.post<CompareResponse>("/compare", data).then(r => r.data),
83
+
84
+ analyzeKeyword: (data: KeywordAnalysisRequest) =>
85
+ client.post<KeywordAnalysisResponse>("/analyze/keyword", data).then(r => r.data),
86
+
87
+ batchAnalyze: (data: BatchAnalysisRequest) =>
88
+ client.post<Record<string, KeywordAnalysisResponse>>("/analyze/batch", data).then(r => r.data),
89
+
90
+ matchKeyword: (data: KeywordMatchRequest) =>
91
+ client.post<MatchResponse>("/match", data).then(r => r.data),
92
+
93
+ analyzeContext: (data: { keyword: string; cluster_threshold?: number; top_words?: number }) =>
94
+ client.post<ContextAnalysisResponse>("/analyze/context", data).then(r => r.data),
95
+
96
+ similarWords: (data: { word: string; top_k: number }) =>
97
+ client.post<{ word: string; similar: { word: string; score: number }[] }>("/analyze/similar-words", data).then(r => r.data),
98
+
99
+ getStats: () =>
100
+ client.get<CorpusStats>("/stats").then(r => r.data),
101
+
102
+ getCorpusTexts: (maxDocs: number = 500) =>
103
+ client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
104
+
105
+ // ---- Engine persistence ----
106
+ saveEngine: () =>
107
+ client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
108
+
109
+ hasSavedState: () =>
110
+ client.get<{ exists: boolean }>("/engine/has-saved-state").then(r => r.data),
111
+
112
+ // ---- Evaluation ----
113
+ getSimilarityDistribution: () =>
114
+ client.get<SimilarityDistribution>("/eval/similarity-distribution").then(r => r.data),
115
+
116
+ evalDisambiguation: (data: { ground_truth: { keyword: string; text: string; true_meaning: string }[]; candidate_meanings: Record<string, string[]> }) =>
117
+ client.post<{ metrics: DisambiguationMetric[] }>("/eval/disambiguation", data).then(r => r.data),
118
+
119
+ evalRetrieval: (data: { queries: { query: string; relevant_doc_ids?: string[]; relevant_texts?: string[] }[]; k_values: number[] }) =>
120
+ client.post<{ metrics: RetrievalMetric[] }>("/eval/retrieval", data).then(r => r.data),
121
+
122
+ // ---- Word2Vec ----
123
+ w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
124
+ client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
125
+
126
+ w2vCompare: (data: { text_a: string; text_b: string }) =>
127
+ client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
128
+
129
+ w2vQuery: (data: { text: string; top_k: number }) =>
130
+ client.post<{ query: string; results: W2VQueryResult[] }>("/w2v/query", data).then(r => r.data),
131
+
132
+ w2vSimilarWords: (data: { word: string; top_k: number }) =>
133
+ client.post<{ word: string; similar: W2VSimilarWord[] }>("/w2v/similar-words", data).then(r => r.data),
134
+
135
+ // ---- Dataset (HuggingFace) ----
136
+ datasetInfo: () =>
137
+ client.get<DatasetInfo>("/dataset/info").then(r => r.data),
138
+
139
+ datasetLoad: (data: DatasetLoadRequest) =>
140
+ client.post<DatasetLoadResponse>("/dataset/load", data, long).then(r => r.data),
141
+
142
+ datasetPreview: (maxDocs: number = 10, sourceFilter?: string) =>
143
+ client.post<DatasetPreviewResponse>(`/dataset/preview?max_docs=${maxDocs}${sourceFilter ? `&source_filter=${sourceFilter}` : ""}`).then(r => r.data),
144
+ };
frontend/src/assets/react.svg ADDED
frontend/src/components/BatchAnalysis.tsx ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { KeywordAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
/**
 * Analyze several keywords in one request: renders a cross-keyword similarity
 * matrix (when the server returned cross similarities) and, per keyword, its
 * meaning clusters with a representative snippet.
 */
export default function BatchAnalysis() {
  const [keywordsText, setKeywordsText] = useState("");
  const [topK, setTopK] = useState(5);
  const [threshold, setThreshold] = useState(0.4);
  const { data: results, loading, error, run } = useApiCall<Record<string, KeywordAnalysisResponse>>();

  // Split the textarea into trimmed, non-empty keywords and submit.
  async function handleAnalyze() {
    const keywords = keywordsText.split("\n").map((s) => s.trim()).filter(Boolean);
    if (keywords.length === 0) return;
    await run(() => api.batchAnalyze({ keywords, top_k: topK, cluster_threshold: threshold, compare_across: true }));
  }

  return (
    <div>
      <div className="panel">
        <h2>Batch Keyword Analysis</h2>
        <p className="panel-desc">
          Analyze multiple keywords at once and compare their semantic relationships.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Keywords (one per line)</label>
            <textarea
              value={keywordsText}
              onChange={(e) => setKeywordsText(e.target.value)}
              placeholder={`pizza\nschool\nhomework`}
              rows={4}
            />
          </div>
          <div className="flex-col gap-1">
            <div className="form-group form-group-sm">
              <label>Top K</label>
              {/* NOTE(review): `+e.target.value` yields NaN while the field is
                  cleared — confirm the server tolerates that. */}
              <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
            </div>
            <div className="form-group form-group-md">
              <label>Cluster Threshold</label>
              <input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
            </div>
          </div>
        </div>
        <button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keywordsText.trim()}>
          {loading ? "Analyzing..." : "Analyze All"}
        </button>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && (
        <>
          {/* Matrix only renders when at least one keyword has cross similarities. */}
          {Object.values(results).some((a) => Object.keys(a.cross_keyword_similarities).length > 0) && (
            <div className="panel">
              <h3>Cross-Keyword Similarity</h3>
              <table className="data-table">
                <thead>
                  <tr>
                    <th>Keyword</th>
                    {Object.keys(results).map((kw) => (
                      <th key={kw}>{kw}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {Object.entries(results).map(([kw, analysis]) => (
                    <tr key={kw}>
                      <td style={{ fontWeight: 600 }}>{kw}</td>
                      {Object.keys(results).map((other) => (
                        <td key={other}>
                          {kw === other ? (
                            <span className="text-dim">-</span>
                          ) : (
                            <ScoreBar score={analysis.cross_keyword_similarities[other] ?? 0} />
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>
          )}

          {/* One panel per keyword with its clusters. */}
          {Object.entries(results).map(([kw, analysis]) => (
            <div key={kw} className="panel">
              <h3>
                "{kw}" &mdash; {analysis.total_occurrences} occurrence(s),{" "}
                {analysis.meaning_clusters.length} cluster(s)
              </h3>
              {analysis.meaning_clusters.map((cluster) => (
                <div key={cluster.cluster_id} className="result-card mt-1">
                  <div className="result-header">
                    <strong>Cluster {cluster.cluster_id}</strong>
                    <span className="tag">{cluster.size} occurrence(s)</span>
                  </div>
                  <div className="result-text">{cluster.representative_text.slice(0, 200)}...</div>
                </div>
              ))}
            </div>
          ))}
        </>
      )}
    </div>
  );
}
frontend/src/components/ContextAnalysis.tsx ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { ContextAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import StatusMessage from "./StatusMessage";
6
+
7
/**
 * Single-keyword context analysis: submits the keyword to /analyze/context
 * and renders each discovered meaning with its associated-word bars and
 * example context snippets.
 */
export default function ContextAnalysis() {
  const [keyword, setKeyword] = useState("");
  const { data: result, loading, error, run } = useApiCall<ContextAnalysisResponse>();

  async function handleAnalyze() {
    if (!keyword.trim()) return;
    await run(() => api.analyzeContext({ keyword: keyword.trim() }));
  }

  return (
    <div>
      <div className="panel">
        <h2>Context Analysis</h2>
        <p className="panel-desc">
          Enter a keyword to discover what it likely means based on how it's used in the corpus.
          The engine clusters all occurrences and extracts the most associated words for each meaning.
        </p>
        <div className="flex-row" style={{ alignItems: "flex-end" }}>
          <div className="form-group form-group-lg">
            <label>Keyword</label>
            <input
              value={keyword}
              onChange={(e) => setKeyword(e.target.value)}
              onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
              placeholder="e.g. Epstein, flight, island"
            />
          </div>
          <button
            className="btn btn-primary"
            onClick={handleAnalyze}
            disabled={loading || !keyword.trim()}
            style={{ height: 38 }}
          >
            {loading ? "Analyzing..." : "Analyze"}
          </button>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {/* Keyword not present in the corpus at all. */}
      {result && result.total_occurrences === 0 && (
        <StatusMessage type="err" message={`No occurrences of "${result.keyword}" found in the corpus.`} />
      )}

      {result && result.meanings.length > 0 && (
        <div className="panel">
          <h2>
            "{result.keyword}" — {result.total_occurrences} occurrences, {result.meanings.length} meaning{result.meanings.length > 1 ? "s" : ""}
          </h2>

          <div className="flex-col gap-3">
            {result.meanings.map((meaning, idx) => (
              <div key={meaning.cluster_id} className="result-card">
                <div className="result-header">
                  <span style={{ fontWeight: 600, fontSize: "0.9rem" }}>
                    Meaning {idx + 1}
                  </span>
                  <div className="flex-row">
                    <span className="badge">
                      {meaning.occurrences} occurrence{meaning.occurrences > 1 ? "s" : ""}
                    </span>
                    {/* Confidence badge: green above 0.5, accent blue otherwise. */}
                    <span
                      className="badge"
                      style={{
                        background: `rgba(${meaning.confidence > 0.5 ? "74, 222, 128" : "108, 140, 255"}, 0.15)`,
                        color: meaning.confidence > 0.5 ? "var(--ok)" : "var(--accent)",
                      }}
                    >
                      {(meaning.confidence * 100).toFixed(1)}%
                    </span>
                  </div>
                </div>

                {/* Associated words bar chart — widths scaled to the top word's score. */}
                <div className="mt-2">
                  {meaning.associated_words.map((aw) => {
                    const maxScore = meaning.associated_words[0]?.score || 1;
                    const pct = Math.round((aw.score / maxScore) * 100);
                    return (
                      <div key={aw.word} className="context-bar-row">
                        <span className="context-bar-label">{aw.word}</span>
                        <div className="context-bar-track">
                          <div className="context-bar-fill" style={{ width: `${pct}%` }} />
                        </div>
                        <span className="context-bar-value">{(aw.score * 100).toFixed(0)}</span>
                      </div>
                    );
                  })}
                </div>

                {/* Example snippets */}
                {meaning.example_contexts.length > 0 && (
                  <div className="mt-2">
                    <div className="section-label">Example contexts</div>
                    {meaning.example_contexts.map((ex, i) => (
                      <div key={i} className="context-snippet">
                        <span className="context-snippet-source">{ex.doc_id}</span>
                        {ex.snippet}
                      </div>
                    ))}
                  </div>
                )}
              </div>
            ))}
          </div>
        </div>
      )}
    </div>
  );
}
frontend/src/components/DatasetPanel.tsx ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types";
4
+ import StatusMessage from "./StatusMessage";
5
+ import MetricCard from "./MetricCard";
6
+ import Toggle from "./Toggle";
7
+ import Select from "./Select";
8
+ import Switch from "./Switch";
9
+ import LogViewer from "./LogViewer";
10
+
11
+ interface Props {
12
+ onStatsUpdate?: (stats: any) => void;
13
+ }
14
+
15
+ export default function DatasetPanel({ onStatsUpdate }: Props) {
16
+ const [info, setInfo] = useState<DatasetInfo | null>(null);
17
+ const [error, setError] = useState("");
18
+
19
+ // Load config
20
+ const [source, setSource] = useState<"raw" | "embeddings">("raw");
21
+ const [maxDocs, setMaxDocs] = useState(500);
22
+ const [minTextLen, setMinTextLen] = useState(100);
23
+ const [sourceFilter, setSourceFilter] = useState("");
24
+ const [loadAll, setLoadAll] = useState(true);
25
+ const [buildIndex, setBuildIndex] = useState(true);
26
+ const [loading, setLoading] = useState(false);
27
+ const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
28
+ const [showAdvanced, setShowAdvanced] = useState(false);
29
+
30
+ // Preview
31
+ const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
32
+ const [previewLoading, setPreviewLoading] = useState(false);
33
+
34
+ useEffect(() => {
35
+ api.datasetInfo().then(setInfo).catch((err) => {
36
+ setError(getErrorMessage(err));
37
+ });
38
+ }, []);
39
+
40
+ async function handlePreview() {
41
+ setPreviewLoading(true); setError("");
42
+ try {
43
+ const res = await api.datasetPreview(10, sourceFilter || undefined);
44
+ setPreviewDocs(res.documents);
45
+ } catch (err) {
46
+ setError(getErrorMessage(err));
47
+ } finally {
48
+ setPreviewLoading(false);
49
+ }
50
+ }
51
+
52
+ async function handleLoad() {
53
+ setLoading(true); setError(""); setLoadResult(null);
54
+ try {
55
+ const res = await api.datasetLoad({
56
+ source,
57
+ max_docs: loadAll ? 100000 : maxDocs,
58
+ min_text_length: loadAll ? 0 : minTextLen,
59
+ source_filter: sourceFilter || undefined,
60
+ build_index: buildIndex,
61
+ });
62
+ setLoadResult(res);
63
+ if (onStatsUpdate) {
64
+ try { const s = await api.getStats(); onStatsUpdate(s); } catch (e) {
65
+ console.warn("Failed to refresh stats after load:", e);
66
+ }
67
+ }
68
+ } catch (err) {
69
+ setError(getErrorMessage(err));
70
+ } finally {
71
+ setLoading(false);
72
+ }
73
+ }
74
+
75
+ return (
76
+ <div>
77
+ {/* Info */}
78
+ <div className="panel">
79
+ <h2>Epstein Files Dataset</h2>
80
+ <p className="panel-desc">
81
+ Load documents from the publicly released U.S. House Oversight Committee Epstein Files
82
+ via HuggingFace. Two sources available:
83
+ </p>
84
+
85
+ {info && (
86
+ <div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
87
+ <div className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
88
+ style={{ flex: "1 1 280px", cursor: "pointer" }}
89
+ onClick={() => setSource("raw")}>
90
+ <div className="result-header">
91
+ <strong>Raw Text Documents</strong>
92
+ <span className="badge">{info.raw_texts.size_mb} MB</span>
93
+ </div>
94
+ <div className="result-text">{info.raw_texts.description}</div>
95
+ <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
96
+ Columns: {info.raw_texts.columns?.join(", ")}
97
+ </div>
98
+ </div>
99
+ <div className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
100
+ style={{ flex: "1 1 280px", cursor: "pointer" }}
101
+ onClick={() => setSource("embeddings")}>
102
+ <div className="result-header">
103
+ <strong>Pre-computed Embeddings</strong>
104
+ <span className="badge">{info.embeddings.vector_dim}d</span>
105
+ </div>
106
+ <div className="result-text">{info.embeddings.description}</div>
107
+ <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
108
+ Model: {info.embeddings.model}
109
+ </div>
110
+ </div>
111
+ </div>
112
+ )}
113
+
114
+ <Toggle
115
+ options={[
116
+ { value: "raw", label: "Raw Texts" },
117
+ { value: "embeddings", label: "ChromaDB Embeddings" },
118
+ ]}
119
+ value={source}
120
+ onChange={(v) => setSource(v as "raw" | "embeddings")}
121
+ />
122
+ </div>
123
+
124
+ {/* Load actions + advanced config */}
125
+ <div className="panel">
126
+ <h2>Load Dataset</h2>
127
+ <div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
128
+ <button className="btn btn-primary" onClick={handleLoad}
129
+ disabled={loading}>
130
+ {loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
131
+ </button>
132
+ {source === "raw" && (
133
+ <button className="btn btn-secondary" onClick={handlePreview}
134
+ disabled={previewLoading}>
135
+ {previewLoading ? "Loading..." : "Preview Documents"}
136
+ </button>
137
+ )}
138
+ </div>
139
+
140
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
141
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
142
+ </button>
143
+
144
+ {showAdvanced && (
145
+ <div className="advanced-section">
146
+ <div className="form-row">
147
+ <div className="form-group" style={{ maxWidth: 200 }}>
148
+ <label>Load All Documents</label>
149
+ <Switch checked={loadAll} onChange={setLoadAll}
150
+ label={loadAll ? "Yes (no limits)" : "No (use filters below)"} />
151
+ </div>
152
+ {!loadAll && (
153
+ <>
154
+ <div className="form-group" style={{ maxWidth: 140 }}>
155
+ <label>Max Documents</label>
156
+ <input type="number" value={maxDocs} onChange={e => setMaxDocs(+e.target.value)}
157
+ min={10} max={100000} />
158
+ </div>
159
+ {source === "raw" && (
160
+ <div className="form-group" style={{ maxWidth: 140 }}>
161
+ <label>Min Text Length</label>
162
+ <input type="number" value={minTextLen} onChange={e => setMinTextLen(+e.target.value)}
163
+ min={0} max={10000} />
164
+ </div>
165
+ )}
166
+ </>
167
+ )}
168
+ {source === "raw" && (
169
+ <div className="form-group" style={{ maxWidth: 220 }}>
170
+ <label>Source Filter</label>
171
+ <Select
172
+ options={[
173
+ { value: "", label: "All sources" },
174
+ { value: "TEXT-", label: "TEXT- (native text files)" },
175
+ { value: "IMAGES-", label: "IMAGES- (OCR from images)" },
176
+ ]}
177
+ value={sourceFilter}
178
+ onChange={setSourceFilter}
179
+ />
180
+ </div>
181
+ )}
182
+ <div className="form-group" style={{ maxWidth: 200 }}>
183
+ <label>Build Index</label>
184
+ <Switch checked={buildIndex} onChange={setBuildIndex}
185
+ label={buildIndex ? "Yes (ready to search)" : "No (load only)"} />
186
+ </div>
187
+ </div>
188
+ </div>
189
+ )}
190
+
191
+ {loading && (
192
+ <StatusMessage type="loading"
193
+ message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..." />
194
+ )}
195
+
196
+ <LogViewer active={loading} />
197
+ </div>
198
+
199
+ {error && <StatusMessage type="err" message={error} />}
200
+
201
+ {/* Load result */}
202
+ {loadResult && (
203
+ <div className="panel">
204
+ <h2>Dataset Loaded</h2>
205
+ <div className="metric-grid mb-2">
206
+ {loadResult.documents_loaded !== undefined && (
207
+ <MetricCard value={loadResult.documents_loaded} label="Documents" />
208
+ )}
209
+ {loadResult.documents_created !== undefined && (
210
+ <MetricCard value={loadResult.documents_created} label="Documents" />
211
+ )}
212
+ {(loadResult.total_chunks || loadResult.chunks_indexed) && (
213
+ <MetricCard value={loadResult.total_chunks || loadResult.chunks_indexed || 0} label="Chunks" />
214
+ )}
215
+ {loadResult.chromadb_vectors !== undefined && (
216
+ <MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
217
+ )}
218
+ <MetricCard value={`${loadResult.seconds}s`} label="Time" />
219
+ </div>
220
+ <StatusMessage type="ok"
221
+ message={loadResult.index_built
222
+ ? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
223
+ : "Dataset loaded. Build the index from the Setup tab to enable search."} />
224
+ </div>
225
+ )}
226
+
227
+ {/* Preview */}
228
+ {previewDocs.length > 0 && (
229
+ <div className="panel">
230
+ <h2>Document Preview ({previewDocs.length} docs)</h2>
231
+ {previewDocs.map((doc, i) => (
232
+ <div key={i} className="result-card" style={{ marginBottom: 8 }}>
233
+ <div className="result-header">
234
+ <span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
235
+ <span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
236
+ </div>
237
+ <div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
238
+ {doc.text_preview}
239
+ </div>
240
+ </div>
241
+ ))}
242
+ </div>
243
+ )}
244
+ </div>
245
+ );
246
+ }
frontend/src/components/EngineSetup.tsx ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { CorpusStats } from "../types";
4
+ import StatusMessage from "./StatusMessage";
5
+ import Select from "./Select";
6
+
7
+ interface Props {
8
+ onStatsUpdate: (stats: CorpusStats) => void;
9
+ }
10
+
11
+ const MODELS = [
12
+ { value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast, 384-dim)" },
13
+ { value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality, 768-dim)" },
14
+ { value: "BAAI/bge-large-en-v1.5", label: "BAAI/bge-large-en-v1.5 (high accuracy, 1024-dim)" },
15
+ ];
16
+
17
+ export default function EngineSetup({ onStatsUpdate }: Props) {
18
+ const [model, setModel] = useState("all-MiniLM-L6-v2");
19
+ const [chunkSize, setChunkSize] = useState(512);
20
+ const [chunkOverlap, setChunkOverlap] = useState(128);
21
+ const [batchSize, setBatchSize] = useState(64);
22
+
23
+ const [docId, setDocId] = useState("");
24
+ const [docText, setDocText] = useState("");
25
+
26
+ const [showAdvanced, setShowAdvanced] = useState(false);
27
+ const [status, setStatus] = useState<{ type: "ok" | "err" | "loading"; msg: string } | null>(null);
28
+ const [initialized, setInitialized] = useState(false);
29
+ const [docsAdded, setDocsAdded] = useState<string[]>([]);
30
+
31
+ async function handleInit() {
32
+ setStatus({ type: "loading", msg: "Loading model..." });
33
+ try {
34
+ const res = await api.init({
35
+ model_name: model,
36
+ chunk_size: chunkSize,
37
+ chunk_overlap: chunkOverlap,
38
+ batch_size: batchSize,
39
+ });
40
+ setInitialized(true);
41
+ setDocsAdded([]);
42
+ setStatus({ type: "ok", msg: `Model "${res.model}" loaded in ${res.load_time_seconds}s` });
43
+ } catch (e: unknown) {
44
+ setStatus({ type: "err", msg: getErrorMessage(e) });
45
+ }
46
+ }
47
+
48
+ async function handleAddDoc() {
49
+ if (!docId.trim() || !docText.trim()) return;
50
+ setStatus({ type: "loading", msg: `Adding document "${docId}"...` });
51
+ try {
52
+ const res = await api.addDocument({ doc_id: docId, text: docText });
53
+ setDocsAdded((prev) => [...prev, res.doc_id]);
54
+ setStatus({ type: "ok", msg: `Added "${res.doc_id}": ${res.num_chunks} chunks` });
55
+ setDocId("");
56
+ setDocText("");
57
+ } catch (e: unknown) {
58
+ setStatus({ type: "err", msg: getErrorMessage(e) });
59
+ }
60
+ }
61
+
62
+ async function handleBuildIndex() {
63
+ setStatus({ type: "loading", msg: "Building FAISS index..." });
64
+ try {
65
+ const res = await api.buildIndex();
66
+ setStatus({
67
+ type: "ok",
68
+ msg: `Index built: ${res.total_chunks} vectors (dim=${res.embedding_dim}) in ${res.build_time_seconds}s`,
69
+ });
70
+ const stats = await api.getStats();
71
+ onStatsUpdate(stats);
72
+ } catch (e: unknown) {
73
+ setStatus({ type: "err", msg: getErrorMessage(e) });
74
+ }
75
+ }
76
+
77
+ return (
78
+ <div>
79
+ {/* Step 1: Initialize engine */}
80
+ <div className="panel">
81
+ <h2>1. Initialize Engine</h2>
82
+ <div className="form-row">
83
+ <div className="form-group">
84
+ <label>Model</label>
85
+ <Select options={MODELS} value={model} onChange={setModel} />
86
+ </div>
87
+ </div>
88
+
89
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
90
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
91
+ </button>
92
+
93
+ {showAdvanced && (
94
+ <div className="advanced-section">
95
+ <div className="form-row">
96
+ <div className="form-group form-group-md">
97
+ <label>Chunk Size</label>
98
+ <input type="number" value={chunkSize} onChange={(e) => setChunkSize(+e.target.value)} />
99
+ </div>
100
+ <div className="form-group form-group-md">
101
+ <label>Overlap</label>
102
+ <input type="number" value={chunkOverlap} onChange={(e) => setChunkOverlap(+e.target.value)} />
103
+ </div>
104
+ <div className="form-group form-group-md">
105
+ <label>Batch Size</label>
106
+ <input type="number" value={batchSize} onChange={(e) => setBatchSize(+e.target.value)} />
107
+ </div>
108
+ </div>
109
+ </div>
110
+ )}
111
+
112
+ <button className="btn btn-primary" onClick={handleInit} style={{ marginTop: 8 }}>
113
+ Initialize
114
+ </button>
115
+ </div>
116
+
117
+ {/* Step 2: Add documents */}
118
+ <div className="panel">
119
+ <h2>2. Add Documents</h2>
120
+ {docsAdded.length > 0 && (
121
+ <div style={{ marginBottom: 12 }}>
122
+ {docsAdded.map((id) => (
123
+ <span key={id} className="tag">{id}</span>
124
+ ))}
125
+ </div>
126
+ )}
127
+ <div className="form-row">
128
+ <div className="form-group form-group-lg">
129
+ <label>Document ID</label>
130
+ <input
131
+ value={docId}
132
+ onChange={(e) => setDocId(e.target.value)}
133
+ placeholder="e.g. chapter_1"
134
+ disabled={!initialized}
135
+ />
136
+ </div>
137
+ </div>
138
+ <div className="form-group mb-2">
139
+ <label>Document Text</label>
140
+ <textarea
141
+ value={docText}
142
+ onChange={(e) => setDocText(e.target.value)}
143
+ placeholder="Paste your document text here..."
144
+ rows={8}
145
+ disabled={!initialized}
146
+ />
147
+ </div>
148
+ <button className="btn btn-primary" onClick={handleAddDoc} disabled={!initialized || !docId || !docText}>
149
+ Add Document
150
+ </button>
151
+ </div>
152
+
153
+ {/* Step 3: Build index */}
154
+ <div className="panel">
155
+ <h2>3. Build Index</h2>
156
+ <p className="panel-desc">
157
+ Embeds all chunks and builds a FAISS index for fast similarity search.
158
+ This must be done after adding all documents.
159
+ </p>
160
+ <button
161
+ className="btn btn-primary"
162
+ onClick={handleBuildIndex}
163
+ disabled={!initialized || docsAdded.length === 0}
164
+ >
165
+ Build Index
166
+ </button>
167
+ </div>
168
+
169
+ {status && <StatusMessage type={status.type} message={status.msg} />}
170
+ </div>
171
+ );
172
+ }
frontend/src/components/EvaluationDashboard.tsx ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import {
3
+ BarChart,
4
+ Bar,
5
+ XAxis,
6
+ YAxis,
7
+ CartesianGrid,
8
+ Tooltip,
9
+ ResponsiveContainer,
10
+ Cell,
11
+ } from "recharts";
12
+ import { api, getErrorMessage } from "../api";
13
+ import type { EvalSection, SimilarityDistribution, DisambiguationMetric, RetrievalMetric } from "../types";
14
+ import StatusMessage from "./StatusMessage";
15
+ import MetricCard from "./MetricCard";
16
+
17
+ // ---- Structured form types ----
18
+
19
// One labeled ground-truth example for the disambiguation form: a sentence
// containing the keyword, plus the short label of its intended meaning
// (label values come from getMeaningLabels).
interface GtRow {
  text: string;
  meaning: string;
}

// One retrieval test case: a search query and a text snippet the engine is
// expected to retrieve for it.
interface RetrievalRow {
  query: string;
  relevantText: string;
}

// ---- Example data ----
// Demo scenario: "pizza" is used both literally (food) and as a code word for
// school, so the engine must separate the two senses from context.

const EXAMPLE_KEYWORD = "pizza";
const EXAMPLE_MEANINGS = [
  "school, education, and academic activities like homework and tests",
  "food, Italian cuisine, restaurant, cooking, and eating",
];
const EXAMPLE_GT: GtRow[] = [
  { text: "I love pizza so much, I go there every day", meaning: "school" },
  { text: "pizza gives me homework", meaning: "school" },
  { text: "she made the best margherita pizza in the city", meaning: "food" },
  { text: "pizza dough recipe used tipo 00 flour", meaning: "food" },
  { text: "The pizza test is going to be so hard", meaning: "school" },
  { text: "This pizza is amazing, the crust is perfectly crispy", meaning: "food" },
];

const EXAMPLE_RETRIEVAL: RetrievalRow[] = [
  { query: "kids using secret code words for school", relevantText: "secret language" },
  { query: "Italian restaurant with wood-fired oven", relevantText: "pizza" },
];
49
+
50
+ // ---- Meaning label helpers ----
51
+
52
+ function getMeaningLabels(meanings: string[]): string[] {
53
+ return meanings.map((m) => {
54
+ const first = m.split(",")[0].trim();
55
+ return first.length > 20 ? first.slice(0, 20) : first;
56
+ });
57
+ }
58
+
59
+ // ---- Tab config ----
60
+
61
// Sub-tab registry for the evaluation dashboard. `id` must be a member of the
// EvalSection union (drives the `section` state); `desc` is the helper text
// shown under the tab bar.
const EVAL_TABS: { id: EvalSection; label: string; desc: string }[] = [
  {
    id: "distribution",
    label: "Distribution",
    desc: "Analyze pairwise similarity distribution across your corpus. One-click — no setup needed.",
  },
  {
    id: "disambiguation",
    label: "Disambiguation",
    desc: "Test whether the engine can tell apart different meanings of the same word. Provide example sentences and label each with the intended meaning.",
  },
  {
    id: "retrieval",
    label: "Retrieval",
    desc: "Measure how well the engine finds relevant documents for a given query. Provide search queries and what text they should match.",
  },
];
78
+
79
+ export default function EvaluationDashboard() {
80
+ const [section, setSection] = useState<EvalSection>("distribution");
81
+ const [distrib, setDistrib] = useState<SimilarityDistribution | null>(null);
82
+ const [disambig, setDisambig] = useState<DisambiguationMetric[] | null>(null);
83
+ const [retrieval, setRetrieval] = useState<RetrievalMetric[] | null>(null);
84
+ const [loading, setLoading] = useState("");
85
+ const [error, setError] = useState("");
86
+
87
+ // Disambiguation structured form
88
+ const [keyword, setKeyword] = useState("");
89
+ const [meanings, setMeanings] = useState<string[]>(["", ""]);
90
+ const [gtRows, setGtRows] = useState<GtRow[]>([{ text: "", meaning: "" }]);
91
+
92
+ // Retrieval structured form
93
+ const [retRows, setRetRows] = useState<RetrievalRow[]>([{ query: "", relevantText: "" }]);
94
+
95
+ // ---- Distribution ----
96
+
97
+ async function fetchDistribution() {
98
+ setLoading("distrib");
99
+ setError("");
100
+ try {
101
+ setDistrib(await api.getSimilarityDistribution());
102
+ } catch (err) {
103
+ setError(getErrorMessage(err));
104
+ } finally {
105
+ setLoading("");
106
+ }
107
+ }
108
+
109
+ // ---- Disambiguation ----
110
+
111
+ function loadDisambiguationExample() {
112
+ setKeyword(EXAMPLE_KEYWORD);
113
+ setMeanings([...EXAMPLE_MEANINGS]);
114
+ setGtRows(EXAMPLE_GT.map((r) => ({ ...r })));
115
+ }
116
+
117
+ function updateMeaning(i: number, val: string) {
118
+ const next = [...meanings];
119
+ next[i] = val;
120
+ setMeanings(next);
121
+ }
122
+
123
+ function addMeaning() {
124
+ setMeanings([...meanings, ""]);
125
+ }
126
+
127
+ function removeMeaning(i: number) {
128
+ if (meanings.length <= 2) return;
129
+ setMeanings(meanings.filter((_, idx) => idx !== i));
130
+ // Update GT rows that referenced removed meaning
131
+ const labels = getMeaningLabels(meanings);
132
+ const removed = labels[i];
133
+ setGtRows(gtRows.map((r) => (r.meaning === removed ? { ...r, meaning: "" } : r)));
134
+ }
135
+
136
+ function updateGtRow(i: number, field: keyof GtRow, val: string) {
137
+ const next = [...gtRows];
138
+ next[i] = { ...next[i], [field]: val };
139
+ setGtRows(next);
140
+ }
141
+
142
+ function addGtRow() {
143
+ setGtRows([...gtRows, { text: "", meaning: "" }]);
144
+ }
145
+
146
+ function removeGtRow(i: number) {
147
+ if (gtRows.length <= 1) return;
148
+ setGtRows(gtRows.filter((_, idx) => idx !== i));
149
+ }
150
+
151
+ async function runDisambiguation() {
152
+ if (!keyword.trim()) { setError("Enter a keyword."); return; }
153
+ const validMeanings = meanings.filter((m) => m.trim());
154
+ if (validMeanings.length < 2) { setError("Add at least 2 meanings."); return; }
155
+ const validGt = gtRows.filter((r) => r.text.trim() && r.meaning);
156
+ if (validGt.length < 2) { setError("Add at least 2 labeled examples."); return; }
157
+
158
+ setLoading("disambig");
159
+ setError("");
160
+ try {
161
+ const labels = getMeaningLabels(meanings);
162
+ const ground_truth = validGt.map((r) => ({
163
+ keyword: keyword.trim(),
164
+ text: r.text,
165
+ true_meaning: r.meaning,
166
+ }));
167
+ const candidate_meanings: Record<string, string[]> = {
168
+ [keyword.trim()]: validMeanings,
169
+ };
170
+ // Map GT meaning labels back to full candidate strings for the API
171
+ // The API compares against candidates, so true_meaning should match a candidate label
172
+ // We use short labels for the dropdown, but the API uses them as-is for matching
173
+ const res = await api.evalDisambiguation({ ground_truth, candidate_meanings });
174
+ setDisambig(res.metrics);
175
+ } catch (e) {
176
+ setError(getErrorMessage(e));
177
+ } finally {
178
+ setLoading("");
179
+ }
180
+ }
181
+
182
+ // ---- Retrieval ----
183
+
184
+ function loadRetrievalExample() {
185
+ setRetRows(EXAMPLE_RETRIEVAL.map((r) => ({ ...r })));
186
+ }
187
+
188
+ function updateRetRow(i: number, field: keyof RetrievalRow, val: string) {
189
+ const next = [...retRows];
190
+ next[i] = { ...next[i], [field]: val };
191
+ setRetRows(next);
192
+ }
193
+
194
+ function addRetRow() {
195
+ setRetRows([...retRows, { query: "", relevantText: "" }]);
196
+ }
197
+
198
+ function removeRetRow(i: number) {
199
+ if (retRows.length <= 1) return;
200
+ setRetRows(retRows.filter((_, idx) => idx !== i));
201
+ }
202
+
203
+ async function runRetrieval() {
204
+ const valid = retRows.filter((r) => r.query.trim());
205
+ if (valid.length === 0) { setError("Add at least one query."); return; }
206
+
207
+ setLoading("retrieval");
208
+ setError("");
209
+ try {
210
+ const queries = valid.map((r) => ({
211
+ query: r.query,
212
+ relevant_texts: r.relevantText.trim() ? [r.relevantText.trim()] : [],
213
+ }));
214
+ const res = await api.evalRetrieval({ queries, k_values: [1, 3, 5, 10] });
215
+ setRetrieval(res.metrics);
216
+ } catch (e) {
217
+ setError(getErrorMessage(e));
218
+ } finally {
219
+ setLoading("");
220
+ }
221
+ }
222
+
223
+ // ---- Meaning labels for dropdown ----
224
+ const meaningLabels = getMeaningLabels(meanings);
225
+
226
+ return (
227
+ <div>
228
+ <nav className="subtabs mb-2">
229
+ {EVAL_TABS.map((t) => (
230
+ <button
231
+ key={t.id}
232
+ className={`subtab ${section === t.id ? "subtab-active" : ""}`}
233
+ onClick={() => { setSection(t.id); setError(""); }}
234
+ >
235
+ {t.label}
236
+ </button>
237
+ ))}
238
+ </nav>
239
+
240
+ <p className="panel-desc">{EVAL_TABS.find((t) => t.id === section)?.desc}</p>
241
+
242
+ {error && <StatusMessage type="err" message={error} />}
243
+
244
+ {/* ---- Similarity Distribution ---- */}
245
+ {section === "distribution" && (
246
+ <div className="panel">
247
+ <button className="btn btn-primary" onClick={fetchDistribution} disabled={loading === "distrib"}>
248
+ {loading === "distrib" ? "Computing..." : "Compute Distribution"}
249
+ </button>
250
+
251
+ {distrib && (
252
+ <div className="mt-2">
253
+ <div className="metric-grid mb-3">
254
+ {[
255
+ { label: "Mean", value: distrib.mean },
256
+ { label: "Std Dev", value: distrib.std },
257
+ { label: "Min", value: distrib.min },
258
+ { label: "Max", value: distrib.max },
259
+ ].map((m) => (
260
+ <MetricCard key={m.label} value={m.value.toFixed(3)} label={m.label} />
261
+ ))}
262
+ </div>
263
+
264
+ <h3>Histogram</h3>
265
+ <ResponsiveContainer width="100%" height={250}>
266
+ <BarChart data={distrib.histogram}>
267
+ <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" />
268
+ <XAxis
269
+ dataKey="bin_start"
270
+ tick={{ fill: "var(--text-dim)", fontSize: 11 }}
271
+ tickFormatter={(v: number) => v.toFixed(1)}
272
+ />
273
+ <YAxis tick={{ fill: "var(--text-dim)", fontSize: 11 }} />
274
+ <Tooltip
275
+ contentStyle={{
276
+ background: "var(--surface)",
277
+ border: "1px solid var(--border)",
278
+ borderRadius: 6,
279
+ color: "var(--text)",
280
+ }}
281
+ formatter={(value: unknown) => [Number(value), "Count"]}
282
+ labelFormatter={(v: unknown) => `Similarity: ${Number(v).toFixed(2)}`}
283
+ />
284
+ <Bar dataKey="count" radius={[4, 4, 0, 0]}>
285
+ {distrib.histogram.map((entry, i) => (
286
+ <Cell
287
+ key={i}
288
+ fill={entry.bin_start >= 0.5 ? "var(--ok)" : entry.bin_start >= 0 ? "var(--accent)" : "var(--err)"}
289
+ />
290
+ ))}
291
+ </Bar>
292
+ </BarChart>
293
+ </ResponsiveContainer>
294
+
295
+ <h3 className="mt-2">Percentiles</h3>
296
+ <table className="data-table">
297
+ <thead>
298
+ <tr>
299
+ {Object.keys(distrib.percentiles).map((p) => (
300
+ <th key={p}>P{p}</th>
301
+ ))}
302
+ </tr>
303
+ </thead>
304
+ <tbody>
305
+ <tr>
306
+ {Object.values(distrib.percentiles).map((v, i) => (
307
+ <td key={i}>{v.toFixed(4)}</td>
308
+ ))}
309
+ </tr>
310
+ </tbody>
311
+ </table>
312
+ </div>
313
+ )}
314
+ </div>
315
+ )}
316
+
317
+ {/* ---- Disambiguation Evaluation ---- */}
318
+ {section === "disambiguation" && (
319
+ <div className="panel">
320
+ <div className="flex-row gap-2 mb-2">
321
+ <button className="btn btn-secondary" onClick={loadDisambiguationExample}>
322
+ Load Example
323
+ </button>
324
+ </div>
325
+
326
+ {/* Keyword */}
327
+ <div className="form-group mb-2" style={{ maxWidth: 300 }}>
328
+ <label>Keyword</label>
329
+ <input
330
+ value={keyword}
331
+ onChange={(e) => setKeyword(e.target.value)}
332
+ placeholder='e.g. "pizza"'
333
+ />
334
+ </div>
335
+
336
+ {/* Candidate Meanings */}
337
+ <div className="mb-2">
338
+ <label className="section-label">
339
+ Candidate Meanings
340
+ <span className="text-dim"> — describe each possible meaning</span>
341
+ </label>
342
+ {meanings.map((m, i) => (
343
+ <div key={i} className="flex-row gap-1 mb-1">
344
+ <span className="text-dim" style={{ minWidth: 24 }}>{i + 1}.</span>
345
+ <input
346
+ value={m}
347
+ onChange={(e) => updateMeaning(i, e.target.value)}
348
+ placeholder={`Meaning ${i + 1} description...`}
349
+ style={{ flex: 1 }}
350
+ />
351
+ {meanings.length > 2 && (
352
+ <button className="btn btn-secondary" onClick={() => removeMeaning(i)}>
353
+ &times;
354
+ </button>
355
+ )}
356
+ </div>
357
+ ))}
358
+ <button className="btn btn-secondary mt-1" onClick={addMeaning}>
359
+ + Add Meaning
360
+ </button>
361
+ </div>
362
+
363
+ {/* Ground Truth Examples */}
364
+ <div className="mb-2">
365
+ <label className="section-label">
366
+ Labeled Examples
367
+ <span className="text-dim"> — sentences using the keyword, with the correct meaning</span>
368
+ </label>
369
+ <table className="data-table">
370
+ <thead>
371
+ <tr>
372
+ <th style={{ width: "60%" }}>Sentence</th>
373
+ <th>Correct Meaning</th>
374
+ <th style={{ width: 40 }} />
375
+ </tr>
376
+ </thead>
377
+ <tbody>
378
+ {gtRows.map((row, i) => (
379
+ <tr key={i}>
380
+ <td>
381
+ <input
382
+ value={row.text}
383
+ onChange={(e) => updateGtRow(i, "text", e.target.value)}
384
+ placeholder="A sentence containing the keyword..."
385
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
386
+ />
387
+ </td>
388
+ <td>
389
+ <select
390
+ value={row.meaning}
391
+ onChange={(e) => updateGtRow(i, "meaning", e.target.value)}
392
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
393
+ >
394
+ <option value="">Select...</option>
395
+ {meaningLabels.map((label, j) => (
396
+ <option key={j} value={label}>{label}</option>
397
+ ))}
398
+ </select>
399
+ </td>
400
+ <td>
401
+ {gtRows.length > 1 && (
402
+ <button className="btn btn-secondary" onClick={() => removeGtRow(i)}>
403
+ &times;
404
+ </button>
405
+ )}
406
+ </td>
407
+ </tr>
408
+ ))}
409
+ </tbody>
410
+ </table>
411
+ <button className="btn btn-secondary mt-1" onClick={addGtRow}>
412
+ + Add Example
413
+ </button>
414
+ </div>
415
+
416
+ <button
417
+ className="btn btn-primary"
418
+ onClick={runDisambiguation}
419
+ disabled={loading === "disambig"}
420
+ >
421
+ {loading === "disambig" ? "Evaluating..." : "Run Evaluation"}
422
+ </button>
423
+
424
+ {disambig && disambig.map((m) => (
425
+ <div key={m.keyword} className="mt-3">
426
+ <h3>Results: "{m.keyword}" ({m.total_samples} samples)</h3>
427
+ <div className="metric-grid mb-2">
428
+ <MetricCard value={`${(m.accuracy * 100).toFixed(1)}%`} label="Accuracy" />
429
+ <MetricCard value={`${(m.weighted_f1 * 100).toFixed(1)}%`} label="Weighted F1" />
430
+ </div>
431
+
432
+ <h3>Per-Meaning Scores</h3>
433
+ <table className="data-table">
434
+ <thead>
435
+ <tr>
436
+ <th>Meaning</th>
437
+ <th>Precision</th>
438
+ <th>Recall</th>
439
+ <th>F1</th>
440
+ </tr>
441
+ </thead>
442
+ <tbody>
443
+ {Object.keys(m.per_meaning_f1).map((meaning) => (
444
+ <tr key={meaning}>
445
+ <td>{meaning}</td>
446
+ <td>{m.per_meaning_precision[meaning]?.toFixed(4) ?? "-"}</td>
447
+ <td>{m.per_meaning_recall[meaning]?.toFixed(4) ?? "-"}</td>
448
+ <td style={{ fontWeight: 700 }}>{m.per_meaning_f1[meaning]?.toFixed(4) ?? "-"}</td>
449
+ </tr>
450
+ ))}
451
+ </tbody>
452
+ </table>
453
+
454
+ {m.confusion_matrix && (
455
+ <>
456
+ <h3 className="mt-2">Confusion Matrix</h3>
457
+ <table className="data-table">
458
+ <thead>
459
+ <tr>
460
+ <th>True \ Predicted</th>
461
+ {Object.keys(m.per_meaning_f1).map((meaning) => (
462
+ <th key={meaning}>{meaning}</th>
463
+ ))}
464
+ </tr>
465
+ </thead>
466
+ <tbody>
467
+ {m.confusion_matrix.map((row, i) => (
468
+ <tr key={i}>
469
+ <td style={{ fontWeight: 600 }}>{Object.keys(m.per_meaning_f1)[i]}</td>
470
+ {row.map((val, j) => (
471
+ <td
472
+ key={j}
473
+ style={{
474
+ fontWeight: i === j ? 700 : 400,
475
+ color: i === j ? "var(--ok)" : val > 0 ? "var(--err)" : "var(--text-dim)",
476
+ }}
477
+ >
478
+ {val}
479
+ </td>
480
+ ))}
481
+ </tr>
482
+ ))}
483
+ </tbody>
484
+ </table>
485
+ </>
486
+ )}
487
+ </div>
488
+ ))}
489
+ </div>
490
+ )}
491
+
492
+ {/* ---- Retrieval Evaluation ---- */}
493
+ {section === "retrieval" && (
494
+ <div className="panel">
495
+ <div className="flex-row gap-2 mb-2">
496
+ <button className="btn btn-secondary" onClick={loadRetrievalExample}>
497
+ Load Example
498
+ </button>
499
+ </div>
500
+
501
+ <label className="section-label">
502
+ Search Queries
503
+ <span className="text-dim"> — enter queries and what text they should find</span>
504
+ </label>
505
+ <table className="data-table mb-2">
506
+ <thead>
507
+ <tr>
508
+ <th style={{ width: "50%" }}>Query</th>
509
+ <th>Expected Match (text snippet)</th>
510
+ <th style={{ width: 40 }} />
511
+ </tr>
512
+ </thead>
513
+ <tbody>
514
+ {retRows.map((row, i) => (
515
+ <tr key={i}>
516
+ <td>
517
+ <input
518
+ value={row.query}
519
+ onChange={(e) => updateRetRow(i, "query", e.target.value)}
520
+ placeholder="A search query..."
521
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
522
+ />
523
+ </td>
524
+ <td>
525
+ <input
526
+ value={row.relevantText}
527
+ onChange={(e) => updateRetRow(i, "relevantText", e.target.value)}
528
+ placeholder="Text that should match..."
529
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
530
+ />
531
+ </td>
532
+ <td>
533
+ {retRows.length > 1 && (
534
+ <button className="btn btn-secondary" onClick={() => removeRetRow(i)}>
535
+ &times;
536
+ </button>
537
+ )}
538
+ </td>
539
+ </tr>
540
+ ))}
541
+ </tbody>
542
+ </table>
543
+ <div className="flex-row gap-2 mb-2">
544
+ <button className="btn btn-secondary" onClick={addRetRow}>
545
+ + Add Query
546
+ </button>
547
+ <button
548
+ className="btn btn-primary"
549
+ onClick={runRetrieval}
550
+ disabled={loading === "retrieval"}
551
+ >
552
+ {loading === "retrieval" ? "Evaluating..." : "Run Evaluation"}
553
+ </button>
554
+ </div>
555
+
556
+ {retrieval && (
557
+ <div className="mt-2">
558
+ <table className="data-table">
559
+ <thead>
560
+ <tr>
561
+ <th>Query</th>
562
+ <th>MRR</th>
563
+ <th>P@1</th>
564
+ <th>P@3</th>
565
+ <th>P@5</th>
566
+ <th>Top Score</th>
567
+ </tr>
568
+ </thead>
569
+ <tbody>
570
+ {retrieval.map((m, i) => (
571
+ <tr key={i}>
572
+ <td style={{ maxWidth: 300 }}>{m.query.length > 50 ? m.query.slice(0, 50) + "..." : m.query}</td>
573
+ <td>{m.mrr.toFixed(3)}</td>
574
+ <td>{m.precision_at_k["1"]?.toFixed(2) ?? "-"}</td>
575
+ <td>{m.precision_at_k["3"]?.toFixed(2) ?? "-"}</td>
576
+ <td>{m.precision_at_k["5"]?.toFixed(2) ?? "-"}</td>
577
+ <td>{m.top_score.toFixed(3)}</td>
578
+ </tr>
579
+ ))}
580
+ </tbody>
581
+ </table>
582
+
583
+ <div className="metric-grid mt-3">
584
+ <MetricCard
585
+ value={(retrieval.reduce((s, m) => s + m.mrr, 0) / retrieval.length).toFixed(3)}
586
+ label="Mean MRR"
587
+ />
588
+ <MetricCard
589
+ value={(retrieval.reduce((s, m) => s + (m.precision_at_k["5"] ?? 0), 0) / retrieval.length).toFixed(3)}
590
+ label="Mean P@5"
591
+ />
592
+ <MetricCard
593
+ value={(retrieval.reduce((s, m) => s + m.top_score, 0) / retrieval.length).toFixed(3)}
594
+ label="Mean Top Score"
595
+ />
596
+ </div>
597
+ </div>
598
+ )}
599
+ </div>
600
+ )}
601
+ </div>
602
+ );
603
+ }
frontend/src/components/KeywordAnalysis.tsx ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { KeywordAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
+ export default function KeywordAnalysis() {
9
+ const [keyword, setKeyword] = useState("");
10
+ const [topK, setTopK] = useState(5);
11
+ const [threshold, setThreshold] = useState(0.4);
12
+ const { data: analysis, loading, error, run } = useApiCall<KeywordAnalysisResponse>();
13
+
14
+ async function handleAnalyze() {
15
+ if (!keyword.trim()) return;
16
+ await run(() => api.analyzeKeyword({ keyword, top_k: topK, cluster_threshold: threshold }));
17
+ }
18
+
19
+ return (
20
+ <div>
21
+ <div className="panel">
22
+ <h2>Keyword Analysis</h2>
23
+ <p className="panel-desc">
24
+ Find all occurrences of a keyword, cluster them by contextual meaning,
25
+ and discover semantically similar passages for each meaning.
26
+ </p>
27
+ <div className="form-row">
28
+ <div className="form-group">
29
+ <label>Keyword</label>
30
+ <input
31
+ value={keyword}
32
+ onChange={(e) => setKeyword(e.target.value)}
33
+ placeholder="e.g. pizza"
34
+ onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
35
+ />
36
+ </div>
37
+ <div className="form-group form-group-sm">
38
+ <label>Top K</label>
39
+ <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
40
+ </div>
41
+ <div className="form-group form-group-md">
42
+ <label>Cluster Threshold</label>
43
+ <input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
44
+ </div>
45
+ <div className="form-group form-group-sm">
46
+ <label>&nbsp;</label>
47
+ <button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keyword.trim()}>
48
+ {loading ? "Analyzing..." : "Analyze"}
49
+ </button>
50
+ </div>
51
+ </div>
52
+ </div>
53
+
54
+ {error && <StatusMessage type="err" message={error} />}
55
+
56
+ {analysis && (
57
+ <div className="panel">
58
+ <h3>
59
+ "{analysis.keyword}" &mdash; {analysis.total_occurrences} occurrence(s),{" "}
60
+ {analysis.meaning_clusters.length} meaning cluster(s)
61
+ </h3>
62
+
63
+ {analysis.meaning_clusters.map((cluster) => (
64
+ <div key={cluster.cluster_id} className="result-card mt-2">
65
+ <div className="result-header">
66
+ <div>
67
+ <strong>Cluster {cluster.cluster_id}</strong>{" "}
68
+ <span className="tag">{cluster.size} occurrence(s)</span>
69
+ </div>
70
+ </div>
71
+
72
+ <div className="mt-1 mb-2">
73
+ <div className="section-label">Contexts:</div>
74
+ {cluster.contexts.map((ctx, i) => (
75
+ <div key={i} className="result-text" style={{ marginBottom: 4, paddingLeft: 12 }}>
76
+ <span className="badge" style={{ marginRight: 6 }}>{ctx.doc_id}</span>
77
+ {ctx.text.slice(0, 200)}...
78
+ </div>
79
+ ))}
80
+ </div>
81
+
82
+ <div>
83
+ <div className="section-label">Similar passages:</div>
84
+ {cluster.similar_passages.map((sp) => (
85
+ <div key={sp.rank} className="flex-row" style={{ alignItems: "start", marginBottom: 6 }}>
86
+ <ScoreBar score={sp.score} />
87
+ <span className="result-text" style={{ flex: 1 }}>
88
+ <span className="badge" style={{ marginRight: 4 }}>{sp.doc_id}</span>
89
+ {sp.text.slice(0, 150)}...
90
+ </span>
91
+ </div>
92
+ ))}
93
+ </div>
94
+ </div>
95
+ ))}
96
+ </div>
97
+ )}
98
+ </div>
99
+ );
100
+ }
frontend/src/components/KeywordMatcher.tsx ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { MatchResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
+ export default function KeywordMatcher() {
9
+ const [keyword, setKeyword] = useState("");
10
+ const [meaningsText, setMeaningsText] = useState("");
11
+ const { data: results, loading, error, setError, run } = useApiCall<MatchResponse>();
12
+
13
+ async function handleMatch() {
14
+ if (!keyword.trim() || !meaningsText.trim()) return;
15
+ const candidates = meaningsText.split("\n").map((s) => s.trim()).filter(Boolean);
16
+ if (candidates.length < 2) {
17
+ setError("Provide at least 2 candidate meanings (one per line).");
18
+ return;
19
+ }
20
+ await run(() => api.matchKeyword({ keyword, candidate_meanings: candidates }));
21
+ }
22
+
23
+ return (
24
+ <div>
25
+ <div className="panel">
26
+ <h2>Keyword Meaning Matcher</h2>
27
+ <p className="panel-desc">
28
+ Match each occurrence of a keyword to the most likely intended meaning.
29
+ For example: keyword "pizza" with candidates "food" and "school".
30
+ </p>
31
+ <div className="form-row">
32
+ <div className="form-group form-group-lg">
33
+ <label>Keyword</label>
34
+ <input value={keyword} onChange={(e) => setKeyword(e.target.value)} placeholder="e.g. pizza" />
35
+ </div>
36
+ </div>
37
+ <div className="form-group mb-2">
38
+ <label>Candidate Meanings (one per line)</label>
39
+ <textarea
40
+ value={meaningsText}
41
+ onChange={(e) => setMeaningsText(e.target.value)}
42
+ placeholder={`Italian food made with dough, tomato sauce, and cheese\nSchool, education, and academic activities`}
43
+ rows={4}
44
+ />
45
+ </div>
46
+ <button className="btn btn-primary" onClick={handleMatch} disabled={loading || !keyword.trim() || !meaningsText.trim()}>
47
+ {loading ? "Matching..." : "Match"}
48
+ </button>
49
+ </div>
50
+
51
+ {error && <StatusMessage type="err" message={error} />}
52
+
53
+ {results && (
54
+ <div className="panel">
55
+ <h3>Matches for "{results.keyword}" ({results.matches.length} occurrences)</h3>
56
+
57
+ {results.matches.map((m, idx) => (
58
+ <div key={idx} className="result-card mt-1">
59
+ <div className="result-header">
60
+ <div>
61
+ <span className="badge">{m.doc_id}</span>{" "}
62
+ <span className="tag">chunk {m.chunk_index}</span>
63
+ </div>
64
+ <span className="tag tag-best">{m.best_match}</span>
65
+ </div>
66
+ <div className="result-text mb-1">{m.text.slice(0, 250)}...</div>
67
+ <div className="flex-row flex-wrap gap-2">
68
+ {Object.entries(m.all_scores).map(([meaning, score]) => (
69
+ <div key={meaning} style={{ flex: "1 1 200px" }}>
70
+ <div
71
+ style={{
72
+ fontSize: "0.78rem",
73
+ color: meaning === m.best_match ? "var(--ok)" : "var(--text-dim)",
74
+ fontWeight: meaning === m.best_match ? 700 : 400,
75
+ marginBottom: 2,
76
+ }}
77
+ >
78
+ {meaning.slice(0, 60)}
79
+ </div>
80
+ <ScoreBar score={score} />
81
+ </div>
82
+ ))}
83
+ </div>
84
+ </div>
85
+ ))}
86
+ </div>
87
+ )}
88
+ </div>
89
+ );
90
+ }
frontend/src/components/LogViewer.tsx ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, useRef } from "react";
2
+
3
+ interface Props {
4
+ /** Whether to actively stream logs */
5
+ active: boolean;
6
+ }
7
+
8
+ export default function LogViewer({ active }: Props) {
9
+ const [lines, setLines] = useState<string[]>([]);
10
+ const containerRef = useRef<HTMLDivElement>(null);
11
+
12
+ useEffect(() => {
13
+ if (!active) return;
14
+
15
+ setLines([]);
16
+ const evtSource = new EventSource("/api/logs/stream");
17
+
18
+ evtSource.onmessage = (event) => {
19
+ setLines((prev) => {
20
+ const next = [...prev, event.data];
21
+ // Keep last 200 lines
22
+ return next.length > 200 ? next.slice(-200) : next;
23
+ });
24
+ };
25
+
26
+ evtSource.onerror = () => {
27
+ // SSE will auto-reconnect, no action needed
28
+ };
29
+
30
+ return () => {
31
+ evtSource.close();
32
+ };
33
+ }, [active]);
34
+
35
+ useEffect(() => {
36
+ // Auto-scroll to bottom
37
+ if (containerRef.current) {
38
+ containerRef.current.scrollTop = containerRef.current.scrollHeight;
39
+ }
40
+ }, [lines]);
41
+
42
+ if (!active && lines.length === 0) return null;
43
+
44
+ return (
45
+ <div
46
+ ref={containerRef}
47
+ style={{
48
+ background: "#0a0c10",
49
+ border: "1px solid var(--border)",
50
+ borderRadius: "var(--radius)",
51
+ padding: "10px 14px",
52
+ marginTop: 12,
53
+ maxHeight: 220,
54
+ overflowY: "auto",
55
+ fontFamily: "'JetBrains Mono', 'Fira Code', 'Consolas', monospace",
56
+ fontSize: "0.75rem",
57
+ lineHeight: 1.7,
58
+ color: "var(--text-dim)",
59
+ }}
60
+ >
61
+ {lines.length === 0 && active && (
62
+ <span style={{ color: "var(--text-dim)", opacity: 0.5 }}>Waiting for logs...</span>
63
+ )}
64
+ {lines.map((line, i) => (
65
+ <div key={i} style={{ whiteSpace: "pre-wrap", wordBreak: "break-all" }}>
66
+ {line}
67
+ </div>
68
+ ))}
69
+ </div>
70
+ );
71
+ }
frontend/src/components/MetricCard.tsx ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
interface Props {
  value: string | number;
  label: string;
  valueColor?: string;
}

/** Small stat tile: a prominent value above a caption, optionally tinted. */
export default function MetricCard({ value, label, valueColor }: Props) {
  const valueStyle = valueColor ? { color: valueColor } : undefined;
  return (
    <div className="metric-card">
      <div className="metric-value" style={valueStyle}>{value}</div>
      <div className="metric-label">{label}</div>
    </div>
  );
}
frontend/src/components/ScoreBar.tsx ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { scoreColor } from "../utils/colors";

interface ScoreBarProps {
  score: number;
  max?: number;
}

/**
 * Horizontal bar visualizing a score relative to `max` (default 1),
 * with the numeric value rendered beside it in a matching color.
 */
export default function ScoreBar({ score, max = 1 }: ScoreBarProps) {
  // Clamp the fill width to 0–100% so out-of-range scores don't overflow.
  const fillPercent = Math.min(100, Math.max(0, (score / max) * 100));
  const barColor = scoreColor(score);
  return (
    <div className="score-bar-container">
      <div className="score-bar">
        <div
          className="score-bar-fill"
          style={{ width: `${fillPercent}%`, background: barColor }}
        />
      </div>
      <span className="score-label" style={{ color: barColor }}>{score.toFixed(4)}</span>
    </div>
  );
}
frontend/src/components/Select.tsx ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState, useRef, useEffect } from "react";

interface Option {
  value: string;
  label: string;
}

interface Props {
  options: Option[];
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
}

/** Minimal custom dropdown; closes when clicking anywhere outside of it. */
export default function Select({ options, value, onChange, placeholder }: Props) {
  const [open, setOpen] = useState(false);
  const rootRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    // Close the dropdown on any mousedown outside the component.
    const onDocMouseDown = (e: MouseEvent) => {
      const root = rootRef.current;
      if (root && !root.contains(e.target as Node)) {
        setOpen(false);
      }
    };
    document.addEventListener("mousedown", onDocMouseDown);
    return () => document.removeEventListener("mousedown", onDocMouseDown);
  }, []);

  const current = options.find((o) => o.value === value);

  function choose(next: string) {
    onChange(next);
    setOpen(false);
  }

  return (
    <div className="custom-select" ref={rootRef}>
      <button
        type="button"
        className="custom-select-trigger"
        onClick={() => setOpen((prev) => !prev)}
      >
        <span>{current?.label || placeholder || "Select..."}</span>
        <span className="custom-select-arrow">{open ? "\u25b4" : "\u25be"}</span>
      </button>
      {open && (
        <div className="custom-select-dropdown">
          {options.map((opt) => (
            <button
              type="button"
              key={opt.value}
              className={`custom-select-option ${opt.value === value ? "custom-select-option-active" : ""}`}
              onClick={() => choose(opt.value)}
            >
              {opt.label}
            </button>
          ))}
        </div>
      )}
    </div>
  );
}
frontend/src/components/SemanticSearch.tsx ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState } from "react";
import { api } from "../api";
import type { QueryResultItem } from "../types";
import { useApiCall } from "../hooks/useApiCall";
import ScoreBar from "./ScoreBar";
import StatusMessage from "./StatusMessage";

/** Corpus-wide semantic search panel backed by the /query endpoint. */
export default function SemanticSearch() {
  const [query, setQuery] = useState("");
  const [topK, setTopK] = useState(10);
  const { data: results, loading, error, run } = useApiCall<QueryResultItem[]>();

  async function handleSearch() {
    if (query.trim() === "") return;
    await run(async () => {
      const res = await api.query({ text: query, top_k: topK });
      return res.results;
    });
  }

  return (
    <div>
      <div className="panel">
        <h2>Semantic Search</h2>
        <p className="panel-desc">
          Find passages most semantically similar to your query across the entire corpus.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Query</label>
            <input
              value={query}
              onChange={(e) => setQuery(e.target.value)}
              placeholder="e.g. a place where children learn and take tests"
              onKeyDown={(e) => e.key === "Enter" && handleSearch()}
            />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
          </div>
          <div className="form-group form-group-sm">
            <label>&nbsp;</label>
            <button className="btn btn-primary" onClick={handleSearch} disabled={loading || !query.trim()}>
              {loading ? "Searching..." : "Search"}
            </button>
          </div>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && (
        <div className="panel">
          <h3>Results ({results.length})</h3>
          {results.map((r) => (
            <div key={`${r.doc_id}-${r.chunk_index}`} className="result-card">
              <div className="result-header">
                <div>
                  <span className="badge">#{r.rank}</span>{" "}
                  <span className="badge">{r.doc_id}</span>{" "}
                  <span className="tag">chunk {r.chunk_index}</span>
                </div>
                <ScoreBar score={r.score} />
              </div>
              <div className="result-text">{r.text}</div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
frontend/src/components/SimilarWords.tsx ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState } from "react";
import { api } from "../api";
import { useApiCall } from "../hooks/useApiCall";
import ScoreBar from "./ScoreBar";
import StatusMessage from "./StatusMessage";

interface SimilarWord {
  word: string;
  score: number;
}

/** Finds contextually similar words via the transformer-embedding endpoint. */
export default function SimilarWords() {
  const [word, setWord] = useState("");
  const [topK, setTopK] = useState(10);
  const { data: results, loading, error, run } = useApiCall<SimilarWord[]>();

  async function handleSearch() {
    const trimmed = word.trim();
    if (!trimmed) return;
    await run(async () => {
      const res = await api.similarWords({ word: trimmed, top_k: topK });
      return res.similar;
    });
  }

  return (
    <div>
      <div className="panel">
        <h2>Similar Words</h2>
        <p className="panel-desc">
          Find words that appear in similar contexts using transformer embeddings.
          Unlike Word2Vec (static, one vector per word), this uses the model's contextual understanding.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Word</label>
            <input
              value={word}
              onChange={(e) => setWord(e.target.value)}
              onKeyDown={(e) => e.key === "Enter" && handleSearch()}
              placeholder="e.g. Epstein, flight, island"
            />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
          </div>
          <div className="form-group form-group-sm">
            <label>&nbsp;</label>
            <button className="btn btn-primary" onClick={handleSearch} disabled={loading || !word.trim()}>
              {loading ? "Searching..." : "Find"}
            </button>
          </div>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && results.length > 0 && (
        <div className="panel">
          <h3>Words similar to "{word}" ({results.length})</h3>
          <table className="data-table">
            <thead>
              <tr><th>Word</th><th>Similarity</th></tr>
            </thead>
            <tbody>
              {results.map((r, i) => (
                <tr key={i}>
                  <td style={{ fontWeight: 600 }}>{r.word}</td>
                  <td><ScoreBar score={r.score} /></td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  );
}
frontend/src/components/StatusMessage.tsx ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface StatusMessageProps {
2
+ type: "ok" | "err" | "loading";
3
+ message: string;
4
+ }
5
+
6
+ export default function StatusMessage({ type, message }: StatusMessageProps) {
7
+ return (
8
+ <div className={`status status-${type}`}>
9
+ {type === "loading" && <span className="spinner" />}
10
+ {message}
11
+ </div>
12
+ );
13
+ }
frontend/src/components/Switch.tsx ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface Props {
2
+ checked: boolean;
3
+ onChange: (checked: boolean) => void;
4
+ label?: string;
5
+ }
6
+
7
+ export default function Switch({ checked, onChange, label }: Props) {
8
+ return (
9
+ <label className="switch">
10
+ <button
11
+ className={`switch-track ${checked ? "switch-track-on" : ""}`}
12
+ onClick={() => onChange(!checked)}
13
+ type="button"
14
+ role="switch"
15
+ aria-checked={checked}
16
+ >
17
+ <span className="switch-thumb" />
18
+ </button>
19
+ {label && <span className="switch-label">{label}</span>}
20
+ </label>
21
+ );
22
+ }
frontend/src/components/TextCompare.tsx ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import { useApiCall } from "../hooks/useApiCall";
4
+ import { scoreColor } from "../utils/colors";
5
+ import StatusMessage from "./StatusMessage";
6
+
7
+ export default function TextCompare() {
8
+ const [textA, setTextA] = useState("");
9
+ const [textB, setTextB] = useState("");
10
+ const { data: similarity, loading, error, run } = useApiCall<number>();
11
+
12
+ async function handleCompare() {
13
+ if (!textA.trim() || !textB.trim()) return;
14
+ await run(() => api.compare({ text_a: textA, text_b: textB }).then((r) => r.similarity));
15
+ }
16
+
17
+ return (
18
+ <div>
19
+ <div className="panel">
20
+ <h2>Compare Texts</h2>
21
+ <p className="panel-desc">
22
+ Compute cosine similarity between two sentences/passages.
23
+ </p>
24
+ <div className="grid-2">
25
+ <div className="form-group">
26
+ <label>Text A</label>
27
+ <textarea
28
+ value={textA}
29
+ onChange={(e) => setTextA(e.target.value)}
30
+ placeholder="I love pizza so much"
31
+ rows={4}
32
+ />
33
+ </div>
34
+ <div className="form-group">
35
+ <label>Text B</label>
36
+ <textarea
37
+ value={textB}
38
+ onChange={(e) => setTextB(e.target.value)}
39
+ placeholder="I love school so much"
40
+ rows={4}
41
+ />
42
+ </div>
43
+ </div>
44
+ <div className="mt-2">
45
+ <button
46
+ className="btn btn-primary"
47
+ onClick={handleCompare}
48
+ disabled={loading || !textA.trim() || !textB.trim()}
49
+ >
50
+ {loading ? "Computing..." : "Compare"}
51
+ </button>
52
+ </div>
53
+ </div>
54
+
55
+ {error && <StatusMessage type="err" message={error} />}
56
+
57
+ {similarity !== null && (
58
+ <div className="panel">
59
+ <div className="similarity-gauge">
60
+ <div className="similarity-value" style={{ color: scoreColor(similarity) }}>
61
+ {similarity.toFixed(4)}
62
+ </div>
63
+ <div className="similarity-label">Cosine Similarity</div>
64
+ <div style={{ width: "100%", maxWidth: 400, marginTop: 16 }}>
65
+ <div className="score-bar" style={{ width: "100%", height: 12 }}>
66
+ <div
67
+ className="score-bar-fill"
68
+ style={{
69
+ width: `${Math.max(0, similarity) * 100}%`,
70
+ background: scoreColor(similarity),
71
+ }}
72
+ />
73
+ </div>
74
+ <div className="score-bar-legend">
75
+ <span>0 (unrelated)</span>
76
+ <span>1 (identical)</span>
77
+ </div>
78
+ </div>
79
+ </div>
80
+ </div>
81
+ )}
82
+ </div>
83
+ );
84
+ }
frontend/src/components/Toggle.tsx ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface Option {
2
+ value: string;
3
+ label: string;
4
+ }
5
+
6
+ interface Props {
7
+ options: Option[];
8
+ value: string;
9
+ onChange: (value: string) => void;
10
+ }
11
+
12
+ export default function Toggle({ options, value, onChange }: Props) {
13
+ return (
14
+ <div className="toggle">
15
+ {options.map((opt) => (
16
+ <button
17
+ key={opt.value}
18
+ className={`toggle-option ${opt.value === value ? "toggle-option-active" : ""}`}
19
+ onClick={() => onChange(opt.value)}
20
+ type="button"
21
+ >
22
+ {opt.label}
23
+ </button>
24
+ ))}
25
+ </div>
26
+ );
27
+ }
frontend/src/components/TrainingPanel.tsx ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { TrainResponse, QueryResultItem, CompareResponse } from "../types";
4
+ import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
+ import { scoreColor } from "../utils/colors";
6
+ import ScoreBar from "./ScoreBar";
7
+ import StatusMessage from "./StatusMessage";
8
+ import MetricCard from "./MetricCard";
9
+ import Toggle from "./Toggle";
10
+ import Select from "./Select";
11
+ import LogViewer from "./LogViewer";
12
+
13
+ type Strategy = "unsupervised" | "contrastive" | "keywords";
14
+
15
+ interface SimilarWord {
16
+ word: string;
17
+ score: number;
18
+ }
19
+
20
+ const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
21
+ { id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
22
+ { id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
23
+ { id: "keywords", label: "Keyword-supervised", desc: "You provide keyword→meaning map. Best if you know the code words." },
24
+ ];
25
+
26
+ const MODELS = [
27
+ { value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast)" },
28
+ { value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality)" },
29
+ ];
30
+
31
+ export default function TrainingPanel() {
32
+ // Training
33
+ const [strategy, setStrategy] = useState<Strategy>("contrastive");
34
+ const [baseModel, setBaseModel] = useState("all-MiniLM-L6-v2");
35
+ const [outputPath, setOutputPath] = useState("./trained_model");
36
+ const [epochs, setEpochs] = useState(5);
37
+ const [batchSize, setBatchSize] = useState(16);
38
+ const [keywordMapText, setKeywordMapText] = useState('{\n "pizza": "school",\n "pepperoni": "math class"\n}');
39
+ const [showAdvanced, setShowAdvanced] = useState(false);
40
+ const [training, setTraining] = useState(false);
41
+ const [result, setResult] = useState<TrainResponse | null>(null);
42
+
43
+ const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
44
+
45
+ // Similar words
46
+ const [simWord, setSimWord] = useState("");
47
+ const [simTopK, setSimTopK] = useState(10);
48
+ const [simResults, setSimResults] = useState<SimilarWord[]>([]);
49
+ const [simLoading, setSimLoading] = useState(false);
50
+
51
+ // Compare
52
+ const [compTextA, setCompTextA] = useState("");
53
+ const [compTextB, setCompTextB] = useState("");
54
+ const [compResult, setCompResult] = useState<CompareResponse | null>(null);
55
+ const [compLoading, setCompLoading] = useState(false);
56
+
57
+ // Search
58
+ const [queryText, setQueryText] = useState("");
59
+ const [queryTopK, setQueryTopK] = useState(5);
60
+ const [queryResults, setQueryResults] = useState<QueryResultItem[]>([]);
61
+ const [queryLoading, setQueryLoading] = useState(false);
62
+
63
+ const ready = result !== null;
64
+
65
+ async function handleTrain() {
66
+ setTraining(true); setError(""); setResult(null);
67
+ try {
68
+ const corpus = parseCorpus();
69
+ if (!corpus.length) { setError("Corpus is empty."); setTraining(false); return; }
70
+
71
+ const base = { corpus_texts: corpus, base_model: baseModel, output_path: outputPath, epochs, batch_size: batchSize };
72
+ let res: TrainResponse;
73
+
74
+ if (strategy === "unsupervised") {
75
+ res = await api.trainUnsupervised(base);
76
+ } else if (strategy === "contrastive") {
77
+ res = await api.trainContrastive(base);
78
+ } else {
79
+ const kw = JSON.parse(keywordMapText);
80
+ res = await api.trainKeywords({ ...base, keyword_meanings: kw });
81
+ }
82
+ setResult(res);
83
+ } catch (e) {
84
+ setError(e instanceof SyntaxError ? "Invalid JSON in keyword map." : getErrorMessage(e));
85
+ } finally {
86
+ setTraining(false);
87
+ }
88
+ }
89
+
90
+ async function handleSimilarWords() {
91
+ setSimLoading(true); setError("");
92
+ try {
93
+ const res = await api.similarWords({ word: simWord, top_k: simTopK });
94
+ setSimResults(res.similar);
95
+ } catch (err) {
96
+ setError(getErrorMessage(err));
97
+ } finally {
98
+ setSimLoading(false);
99
+ }
100
+ }
101
+
102
+ async function handleCompare() {
103
+ setCompLoading(true); setError("");
104
+ try {
105
+ const res = await api.compare({ text_a: compTextA, text_b: compTextB });
106
+ setCompResult(res);
107
+ } catch (err) {
108
+ setError(getErrorMessage(err));
109
+ } finally {
110
+ setCompLoading(false);
111
+ }
112
+ }
113
+
114
+ async function handleQuery() {
115
+ setQueryLoading(true); setError("");
116
+ try {
117
+ const res = await api.query({ text: queryText, top_k: queryTopK });
118
+ setQueryResults(res.results);
119
+ } catch (err) {
120
+ setError(getErrorMessage(err));
121
+ } finally {
122
+ setQueryLoading(false);
123
+ }
124
+ }
125
+
126
+ return (
127
+ <div>
128
+ {/* 1. Training (strategy + config + corpus merged) */}
129
+ <div className="panel">
130
+ <h2>1. Fine-tune Transformer</h2>
131
+ <p className="panel-desc">
132
+ Fine-tune a pre-trained sentence transformer on your corpus to improve contextual understanding.
133
+ </p>
134
+
135
+ <div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
136
+ <button className="btn btn-secondary" onClick={loadFromEngine}
137
+ disabled={corpusLoading}>
138
+ {corpusLoading ? "Loading..." : "Load from Engine"}
139
+ </button>
140
+ {corpusText && (
141
+ <button className="btn btn-secondary" onClick={() => setCorpusText("")}>
142
+ Clear
143
+ </button>
144
+ )}
145
+ </div>
146
+ <div className="form-group" style={{ marginBottom: 12 }}>
147
+ <label>
148
+ Corpus (separate documents with blank lines)
149
+ {corpusText && (
150
+ <span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
151
+ {" "} — {parseCorpus().length} documents detected
152
+ </span>
153
+ )}
154
+ </label>
155
+ <textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
156
+ placeholder="Document 1 text...\n\nDocument 2 text..." />
157
+ </div>
158
+
159
+ <label className="section-label">Strategy</label>
160
+ <Toggle
161
+ options={STRATEGIES.map(s => ({ value: s.id, label: s.label }))}
162
+ value={strategy}
163
+ onChange={(v) => setStrategy(v as Strategy)}
164
+ />
165
+ <p style={{ color: "var(--text-dim)", fontSize: "0.85rem", marginBottom: 12 }}>
166
+ {STRATEGIES.find(s => s.id === strategy)?.desc}
167
+ </p>
168
+
169
+ {strategy === "keywords" && (
170
+ <div className="form-group" style={{ marginBottom: 12 }}>
171
+ <label>Keyword → Meaning Map (JSON)</label>
172
+ <textarea value={keywordMapText} onChange={e => setKeywordMapText(e.target.value)}
173
+ rows={4} style={{ fontFamily: "monospace", fontSize: "0.8rem" }} />
174
+ </div>
175
+ )}
176
+
177
+ <div className="form-row" style={{ marginBottom: 12 }}>
178
+ <div className="form-group">
179
+ <label>Base Model</label>
180
+ <Select options={MODELS} value={baseModel} onChange={setBaseModel} />
181
+ </div>
182
+ </div>
183
+
184
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
185
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
186
+ </button>
187
+
188
+ {showAdvanced && (
189
+ <div className="advanced-section">
190
+ <div className="form-row">
191
+ <div className="form-group" style={{ maxWidth: 100 }}>
192
+ <label>Epochs</label>
193
+ <input type="number" value={epochs} onChange={e => setEpochs(+e.target.value)} min={1} max={50} />
194
+ </div>
195
+ <div className="form-group" style={{ maxWidth: 120 }}>
196
+ <label>Batch Size</label>
197
+ <input type="number" value={batchSize} onChange={e => setBatchSize(+e.target.value)} min={4} max={128} />
198
+ </div>
199
+ <div className="form-group" style={{ maxWidth: 200 }}>
200
+ <label>Output Path</label>
201
+ <input value={outputPath} onChange={e => setOutputPath(e.target.value)} />
202
+ </div>
203
+ </div>
204
+ </div>
205
+ )}
206
+
207
+ <button className="btn btn-primary" onClick={handleTrain}
208
+ disabled={training || !corpusText.trim()} style={{ marginTop: 8 }}>
209
+ {training ? <><span className="spinner" /> Training...</> : "Start Training"}
210
+ </button>
211
+
212
+ <LogViewer active={training} />
213
+ </div>
214
+
215
+ {error && <StatusMessage type="err" message={error} />}
216
+
217
+ {result && (
218
+ <div className="panel">
219
+ <h2>Training Complete</h2>
220
+ <div className="metric-grid">
221
+ <MetricCard value={result.training_pairs} label="Training Pairs" />
222
+ <MetricCard value={result.epochs} label="Epochs" />
223
+ <MetricCard value={`${result.seconds}s`} label="Time" />
224
+ </div>
225
+ <StatusMessage type="ok"
226
+ message={`Model saved: ${result.model_path} — use this path in the Setup tab.`} />
227
+ </div>
228
+ )}
229
+
230
+ {/* 2. Similar Words */}
231
+ <div className="panel">
232
+ <h2>2. Similar Words</h2>
233
+ <p className="panel-desc">
234
+ Find words that appear in similar contexts using transformer embeddings.
235
+ </p>
236
+ <div className="form-row">
237
+ <div className="form-group">
238
+ <label>Word</label>
239
+ <input value={simWord} onChange={e => setSimWord(e.target.value)}
240
+ onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
241
+ placeholder="e.g. pizza" />
242
+ </div>
243
+ <div className="form-group form-group-sm">
244
+ <label>Top K</label>
245
+ <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
246
+ </div>
247
+ <div className="form-group form-group-sm">
248
+ <label>&nbsp;</label>
249
+ <button className="btn btn-primary" onClick={handleSimilarWords}
250
+ disabled={simLoading || !simWord.trim()}>
251
+ {simLoading ? "Searching..." : "Find"}
252
+ </button>
253
+ </div>
254
+ </div>
255
+
256
+ {simResults.length > 0 && (
257
+ <table className="data-table" style={{ marginTop: 12 }}>
258
+ <thead>
259
+ <tr><th>Word</th><th>Similarity</th></tr>
260
+ </thead>
261
+ <tbody>
262
+ {simResults.map((r, i) => (
263
+ <tr key={i}>
264
+ <td style={{ fontWeight: 600 }}>{r.word}</td>
265
+ <td><ScoreBar score={r.score} /></td>
266
+ </tr>
267
+ ))}
268
+ </tbody>
269
+ </table>
270
+ )}
271
+ </div>
272
+
273
+ {/* 3. Compare Texts */}
274
+ <div className="panel">
275
+ <h2>3. Compare Texts</h2>
276
+ <p className="panel-desc">
277
+ Sentence similarity via transformer contextual embeddings.
278
+ </p>
279
+ <div className="form-row">
280
+ <div className="form-group">
281
+ <label>Text A</label>
282
+ <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
283
+ placeholder="pizza gives me homework" />
284
+ </div>
285
+ <div className="form-group">
286
+ <label>Text B</label>
287
+ <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
288
+ placeholder="school gives me homework" />
289
+ </div>
290
+ </div>
291
+ <button className="btn btn-primary" onClick={handleCompare}
292
+ disabled={compLoading || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
293
+ {compLoading ? "Comparing..." : "Compare"}
294
+ </button>
295
+
296
+ {compResult && (
297
+ <div className="similarity-gauge" style={{ marginTop: 16 }}>
298
+ <div className="similarity-value"
299
+ style={{ color: scoreColor(compResult.similarity) }}>
300
+ {compResult.similarity.toFixed(4)}
301
+ </div>
302
+ <div className="similarity-label">Transformer Cosine Similarity</div>
303
+ </div>
304
+ )}
305
+ </div>
306
+
307
+ {/* 4. Semantic Search */}
308
+ <div className="panel">
309
+ <h2>4. Semantic Search</h2>
310
+ <p className="panel-desc">
311
+ Search your corpus using transformer embeddings.
312
+ </p>
313
+ <div className="form-row">
314
+ <div className="form-group">
315
+ <label>Query</label>
316
+ <input value={queryText} onChange={e => setQueryText(e.target.value)}
317
+ onKeyDown={e => e.key === "Enter" && handleQuery()}
318
+ placeholder="a place where children learn" />
319
+ </div>
320
+ <div className="form-group form-group-sm">
321
+ <label>Top K</label>
322
+ <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
323
+ </div>
324
+ <div className="form-group form-group-sm">
325
+ <label>&nbsp;</label>
326
+ <button className="btn btn-primary" onClick={handleQuery}
327
+ disabled={queryLoading || !queryText.trim()}>
328
+ {queryLoading ? "Searching..." : "Search"}
329
+ </button>
330
+ </div>
331
+ </div>
332
+
333
+ {queryResults.length > 0 && (
334
+ <div style={{ marginTop: 12 }}>
335
+ {queryResults.map((r, i) => (
336
+ <div key={i} className="result-card">
337
+ <div className="result-header">
338
+ <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
339
+ <ScoreBar score={r.score} />
340
+ </div>
341
+ <div className="result-text">{r.text}</div>
342
+ </div>
343
+ ))}
344
+ </div>
345
+ )}
346
+ </div>
347
+ </div>
348
+ );
349
+ }
frontend/src/components/Word2VecPanel.tsx ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { W2VInitResponse, W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
4
+ import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
+ import { scoreColor } from "../utils/colors";
6
+ import ScoreBar from "./ScoreBar";
7
+ import StatusMessage from "./StatusMessage";
8
+ import LogViewer from "./LogViewer";
9
+ import MetricCard from "./MetricCard";
10
+
11
+ export default function Word2VecPanel() {
12
+ // Init
13
+ const [vectorSize, setVectorSize] = useState(100);
14
+ const [windowSize, setWindowSize] = useState(5);
15
+ const [w2vEpochs, setW2vEpochs] = useState(50);
16
+ const [showAdvanced, setShowAdvanced] = useState(false);
17
+ const [initLoading, setInitLoading] = useState(false);
18
+ const [initResult, setInitResult] = useState<W2VInitResponse | null>(null);
19
+
20
+ const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
21
+
22
+ // Similar words
23
+ const [simWord, setSimWord] = useState("");
24
+ const [simTopK, setSimTopK] = useState(10);
25
+ const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
26
+ const [simLoading, setSimLoading] = useState(false);
27
+
28
+ // Compare
29
+ const [compTextA, setCompTextA] = useState("");
30
+ const [compTextB, setCompTextB] = useState("");
31
+ const [compResult, setCompResult] = useState<CompareResponse | null>(null);
32
+ const [compLoading, setCompLoading] = useState(false);
33
+
34
+ // Query
35
+ const [queryText, setQueryText] = useState("");
36
+ const [queryTopK, setQueryTopK] = useState(5);
37
+ const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
38
+ const [queryLoading, setQueryLoading] = useState(false);
39
+
40
+ async function handleInit() {
41
+ setInitLoading(true); setError(""); setInitResult(null);
42
+ try {
43
+ const corpus = parseCorpus();
44
+ if (!corpus.length) { setError("Corpus is empty."); setInitLoading(false); return; }
45
+ const res = await api.w2vInit({
46
+ corpus_texts: corpus,
47
+ vector_size: vectorSize,
48
+ window: windowSize,
49
+ epochs: w2vEpochs,
50
+ });
51
+ setInitResult(res);
52
+ } catch (err) {
53
+ setError(getErrorMessage(err));
54
+ } finally {
55
+ setInitLoading(false);
56
+ }
57
+ }
58
+
59
+ async function handleSimilarWords() {
60
+ setSimLoading(true); setError("");
61
+ try {
62
+ const res = await api.w2vSimilarWords({ word: simWord, top_k: simTopK });
63
+ setSimResults(res.similar);
64
+ } catch (err) {
65
+ setError(getErrorMessage(err));
66
+ } finally {
67
+ setSimLoading(false);
68
+ }
69
+ }
70
+
71
+ async function handleCompare() {
72
+ setCompLoading(true); setError("");
73
+ try {
74
+ const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
75
+ setCompResult(res);
76
+ } catch (err) {
77
+ setError(getErrorMessage(err));
78
+ } finally {
79
+ setCompLoading(false);
80
+ }
81
+ }
82
+
83
+ async function handleQuery() {
84
+ setQueryLoading(true); setError("");
85
+ try {
86
+ const res = await api.w2vQuery({ text: queryText, top_k: queryTopK });
87
+ setQueryResults(res.results);
88
+ } catch (err) {
89
+ setError(getErrorMessage(err));
90
+ } finally {
91
+ setQueryLoading(false);
92
+ }
93
+ }
94
+
95
+ const ready = initResult !== null;
96
+
97
+ return (
98
+ <div>
99
+ {/* 1. Training */}
100
+ <div className="panel">
101
+ <h2>1. Train Word2Vec (gensim)</h2>
102
+ <p className="panel-desc">
103
+ Static embeddings — one vector per word, no context awareness.
104
+ Useful as a baseline to compare against the transformer approach.
105
+ </p>
106
+ <div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
107
+ <button className="btn btn-secondary" onClick={loadFromEngine}
108
+ disabled={corpusLoading}>
109
+ {corpusLoading ? "Loading..." : "Load from Engine"}
110
+ </button>
111
+ {corpusText && (
112
+ <button className="btn btn-secondary" onClick={() => setCorpusText("")}>
113
+ Clear
114
+ </button>
115
+ )}
116
+ </div>
117
+ <div className="form-group" style={{ marginBottom: 12 }}>
118
+ <label>
119
+ Corpus (separate documents with blank lines)
120
+ {corpusText && (
121
+ <span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
122
+ {" "} — {parseCorpus().length} documents detected
123
+ </span>
124
+ )}
125
+ </label>
126
+ <textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
127
+ placeholder="Document 1 text...\n\nDocument 2 text..." />
128
+ </div>
129
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
130
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
131
+ </button>
132
+
133
+ {showAdvanced && (
134
+ <div className="advanced-section">
135
+ <div className="form-row">
136
+ <div className="form-group" style={{ maxWidth: 120 }}>
137
+ <label>Vector Size</label>
138
+ <input type="number" value={vectorSize} onChange={e => setVectorSize(+e.target.value)} min={50} max={300} />
139
+ </div>
140
+ <div className="form-group" style={{ maxWidth: 120 }}>
141
+ <label>Window</label>
142
+ <input type="number" value={windowSize} onChange={e => setWindowSize(+e.target.value)} min={2} max={15} />
143
+ </div>
144
+ <div className="form-group" style={{ maxWidth: 120 }}>
145
+ <label>Epochs</label>
146
+ <input type="number" value={w2vEpochs} onChange={e => setW2vEpochs(+e.target.value)} min={5} max={200} />
147
+ </div>
148
+ </div>
149
+ </div>
150
+ )}
151
+
152
+ <button className="btn btn-primary" onClick={handleInit}
153
+ disabled={initLoading || !corpusText.trim()} style={{ marginTop: 8 }}>
154
+ {initLoading ? <><span className="spinner" /> Training...</> : "Train Word2Vec"}
155
+ </button>
156
+
157
+ <LogViewer active={initLoading} />
158
+ </div>
159
+
160
+ {error && <StatusMessage type="err" message={error} />}
161
+
162
+ {initResult && (
163
+ <div className="panel">
164
+ <h2>Word2Vec Ready</h2>
165
+ <div className="metric-grid">
166
+ <MetricCard value={initResult.vocab_size} label="Vocabulary" />
167
+ <MetricCard value={initResult.sentences} label="Sentences" />
168
+ <MetricCard value={initResult.vector_size} label="Dimensions" />
169
+ <MetricCard value={`${initResult.seconds}s`} label="Time" />
170
+ </div>
171
+ </div>
172
+ )}
173
+
174
+ {/* 2. Similar Words */}
175
+ <div className="panel">
176
+ <h2>2. Similar Words</h2>
177
+ <p className="panel-desc">
178
+ Find words that appear in similar contexts using Word2Vec static embeddings.
179
+ </p>
180
+ <div className="form-row">
181
+ <div className="form-group">
182
+ <label>Word</label>
183
+ <input value={simWord} onChange={e => setSimWord(e.target.value)}
184
+ onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
185
+ placeholder="e.g. pizza" />
186
+ </div>
187
+ <div className="form-group form-group-sm">
188
+ <label>Top K</label>
189
+ <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
190
+ </div>
191
+ <div className="form-group form-group-sm">
192
+ <label>&nbsp;</label>
193
+ <button className="btn btn-primary" onClick={handleSimilarWords}
194
+ disabled={simLoading || !ready || !simWord.trim()}>
195
+ {simLoading ? "Searching..." : "Find"}
196
+ </button>
197
+ </div>
198
+ </div>
199
+
200
+ {simResults.length > 0 && (
201
+ <table className="data-table" style={{ marginTop: 12 }}>
202
+ <thead>
203
+ <tr><th>Word</th><th>Similarity</th></tr>
204
+ </thead>
205
+ <tbody>
206
+ {simResults.map((r, i) => (
207
+ <tr key={i}>
208
+ <td style={{ fontWeight: 600 }}>{r.word}</td>
209
+ <td><ScoreBar score={r.score} /></td>
210
+ </tr>
211
+ ))}
212
+ </tbody>
213
+ </table>
214
+ )}
215
+ </div>
216
+
217
+ {/* 3. Compare Texts */}
218
+ <div className="panel">
219
+ <h2>3. Compare Texts</h2>
220
+ <p className="panel-desc">
221
+ Sentence similarity via averaged word vectors.
222
+ </p>
223
+ <div className="form-row">
224
+ <div className="form-group">
225
+ <label>Text A</label>
226
+ <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
227
+ placeholder="pizza gives me homework" />
228
+ </div>
229
+ <div className="form-group">
230
+ <label>Text B</label>
231
+ <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
232
+ placeholder="school gives me homework" />
233
+ </div>
234
+ </div>
235
+ <button className="btn btn-primary" onClick={handleCompare}
236
+ disabled={compLoading || !ready || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
237
+ {compLoading ? "Comparing..." : "Compare"}
238
+ </button>
239
+
240
+ {compResult && (
241
+ <div className="similarity-gauge" style={{ marginTop: 16 }}>
242
+ <div className="similarity-value"
243
+ style={{ color: scoreColor(compResult.similarity) }}>
244
+ {compResult.similarity.toFixed(4)}
245
+ </div>
246
+ <div className="similarity-label">Word2Vec Cosine Similarity</div>
247
+ </div>
248
+ )}
249
+ </div>
250
+
251
+ {/* 4. Semantic Search */}
252
+ <div className="panel">
253
+ <h2>4. Semantic Search</h2>
254
+ <p className="panel-desc">
255
+ Search your corpus using averaged Word2Vec vectors.
256
+ </p>
257
+ <div className="form-row">
258
+ <div className="form-group">
259
+ <label>Query</label>
260
+ <input value={queryText} onChange={e => setQueryText(e.target.value)}
261
+ onKeyDown={e => e.key === "Enter" && handleQuery()}
262
+ placeholder="a place where children learn" />
263
+ </div>
264
+ <div className="form-group form-group-sm">
265
+ <label>Top K</label>
266
+ <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
267
+ </div>
268
+ <div className="form-group form-group-sm">
269
+ <label>&nbsp;</label>
270
+ <button className="btn btn-primary" onClick={handleQuery}
271
+ disabled={queryLoading || !ready || !queryText.trim()}>
272
+ {queryLoading ? "Searching..." : "Search"}
273
+ </button>
274
+ </div>
275
+ </div>
276
+
277
+ {queryResults.length > 0 && (
278
+ <div style={{ marginTop: 12 }}>
279
+ {queryResults.map((r, i) => (
280
+ <div key={i} className="result-card">
281
+ <div className="result-header">
282
+ <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
283
+ <ScoreBar score={r.score} />
284
+ </div>
285
+ <div className="result-text">{r.text}</div>
286
+ </div>
287
+ ))}
288
+ </div>
289
+ )}
290
+ </div>
291
+ </div>
292
+ );
293
+ }
frontend/src/hooks/useApiCall.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useCallback } from "react";
2
+ import { getErrorMessage } from "../api";
3
+
4
+ /**
5
+ * Generic hook for API calls with loading/error/result state.
6
+ * Eliminates the repeated try/catch/setLoading/setError pattern.
7
+ */
8
+ export function useApiCall<T>() {
9
+ const [data, setData] = useState<T | null>(null);
10
+ const [loading, setLoading] = useState(false);
11
+ const [error, setError] = useState("");
12
+
13
+ const run = useCallback(async (fn: () => Promise<T>): Promise<T | null> => {
14
+ setLoading(true);
15
+ setError("");
16
+ try {
17
+ const result = await fn();
18
+ setData(result);
19
+ return result;
20
+ } catch (err) {
21
+ setError(getErrorMessage(err));
22
+ return null;
23
+ } finally {
24
+ setLoading(false);
25
+ }
26
+ }, []);
27
+
28
+ const clear = useCallback(() => {
29
+ setData(null);
30
+ setError("");
31
+ }, []);
32
+
33
+ return { data, loading, error, setError, run, clear };
34
+ }
frontend/src/hooks/useCorpusLoader.ts ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+
4
+ /**
5
+ * Shared hook for loading corpus text from the engine and parsing it into documents.
6
+ * Used by both TrainingPanel and Word2VecPanel.
7
+ */
8
+ export function useCorpusLoader() {
9
+ const [corpusText, setCorpusText] = useState("");
10
+ const [loading, setLoading] = useState(false);
11
+ const [error, setError] = useState("");
12
+
13
+ function parseCorpus(): string[] {
14
+ return corpusText
15
+ .split(/\n{2,}/)
16
+ .map((t) => t.trim())
17
+ .filter((t) => t.length > 20);
18
+ }
19
+
20
+ async function loadFromEngine() {
21
+ setLoading(true);
22
+ setError("");
23
+ try {
24
+ const res = await api.getCorpusTexts();
25
+ if (res.documents.length === 0) {
26
+ setError("No documents loaded in the engine. Load a dataset first.");
27
+ return;
28
+ }
29
+ setCorpusText(
30
+ res.documents.map((d: { doc_id: string; text: string }) => d.text).join("\n\n")
31
+ );
32
+ } catch (e) {
33
+ setError(getErrorMessage(e));
34
+ } finally {
35
+ setLoading(false);
36
+ }
37
+ }
38
+
39
+ return {
40
+ corpusText,
41
+ setCorpusText,
42
+ loading,
43
+ error,
44
+ setError,
45
+ parseCorpus,
46
+ loadFromEngine,
47
+ };
48
+ }
frontend/src/main.tsx ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
// Application entry point: mount the React tree at #root in StrictMode.
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import App from "./App";

// The non-null assertion is safe only if the host page always ships a #root
// element — presumably index.html does; verify if the HTML template changes.
createRoot(document.getElementById("root")!).render(
  <StrictMode>
    <App />
  </StrictMode>
);
frontend/src/styles.css ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ---- Reset & Base ---- */
2
+ *,
3
+ *::before,
4
+ *::after {
5
+ box-sizing: border-box;
6
+ margin: 0;
7
+ padding: 0;
8
+ }
9
+
10
+ :root {
11
+ --bg: #0f1117;
12
+ --surface: #1a1d27;
13
+ --surface2: #232733;
14
+ --border: #2e3340;
15
+ --text: #e1e4eb;
16
+ --text-dim: #8b90a0;
17
+ --accent: #6c8cff;
18
+ --accent-dim: #4a64cc;
19
+ --ok: #4ade80;
20
+ --warn: #facc15;
21
+ --err: #f87171;
22
+ --radius: 8px;
23
+ }
24
+
25
+ body {
26
+ font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
27
+ background: var(--bg);
28
+ color: var(--text);
29
+ line-height: 1.6;
30
+ }
31
+
32
+ /* ---- App Layout ---- */
33
+ .app {
34
+ max-width: 1200px;
35
+ margin: 0 auto;
36
+ padding: 24px;
37
+ }
38
+
39
+ .app-header {
40
+ display: flex;
41
+ justify-content: space-between;
42
+ align-items: center;
43
+ flex-wrap: wrap;
44
+ gap: 12px;
45
+ margin-bottom: 24px;
46
+ }
47
+
48
+ .app-header h1 {
49
+ font-size: 1.5rem;
50
+ font-weight: 700;
51
+ color: var(--accent);
52
+ }
53
+
54
+ .header-stats {
55
+ display: flex;
56
+ gap: 8px;
57
+ flex-wrap: wrap;
58
+ }
59
+
60
+ /* ---- Badges ---- */
61
+ .badge {
62
+ padding: 4px 10px;
63
+ border-radius: 12px;
64
+ font-size: 0.75rem;
65
+ font-weight: 600;
66
+ background: var(--surface2);
67
+ color: var(--text-dim);
68
+ }
69
+ .badge-ok {
70
+ background: #1a3a2a;
71
+ color: var(--ok);
72
+ }
73
+ .badge-warn {
74
+ background: #3a3520;
75
+ color: var(--warn);
76
+ }
77
+
78
+ /* ---- Progress Stepper ---- */
79
+ .stepper {
80
+ display: flex;
81
+ align-items: flex-start;
82
+ justify-content: center;
83
+ margin-bottom: 28px;
84
+ padding: 0 24px;
85
+ }
86
+
87
+ .stepper-item {
88
+ display: flex;
89
+ flex-direction: column;
90
+ align-items: center;
91
+ gap: 6px;
92
+ position: relative;
93
+ z-index: 1;
94
+ }
95
+
96
+ .stepper-line {
97
+ flex: 1;
98
+ height: 2px;
99
+ background: var(--border);
100
+ margin-top: 16px;
101
+ min-width: 40px;
102
+ }
103
+
104
+ .stepper-line-active {
105
+ background: var(--accent-dim);
106
+ }
107
+
108
+ .stepper-circle {
109
+ width: 34px;
110
+ height: 34px;
111
+ border-radius: 50%;
112
+ border: 2px solid var(--border);
113
+ background: var(--surface);
114
+ color: var(--text-dim);
115
+ font-weight: 700;
116
+ font-size: 0.85rem;
117
+ cursor: pointer;
118
+ display: flex;
119
+ align-items: center;
120
+ justify-content: center;
121
+ transition: all 0.2s;
122
+ }
123
+
124
+ .stepper-circle:hover:not(:disabled) {
125
+ border-color: var(--accent);
126
+ color: var(--accent);
127
+ }
128
+
129
+ .stepper-circle.stepper-active {
130
+ border-color: var(--accent);
131
+ background: var(--accent);
132
+ color: #fff;
133
+ }
134
+
135
+ .stepper-circle.stepper-done {
136
+ border-color: var(--ok);
137
+ background: #1a3a2a;
138
+ color: var(--ok);
139
+ }
140
+
141
+ .stepper-circle:disabled {
142
+ opacity: 0.35;
143
+ cursor: not-allowed;
144
+ }
145
+
146
+ .stepper-label {
147
+ font-size: 0.75rem;
148
+ color: var(--text-dim);
149
+ white-space: nowrap;
150
+ font-weight: 500;
151
+ }
152
+
153
+ .stepper-label-active {
154
+ color: var(--accent);
155
+ font-weight: 600;
156
+ }
157
+
158
+ /* ---- Sub-tabs ---- */
159
+ .subtabs {
160
+ display: flex;
161
+ gap: 2px;
162
+ background: var(--surface);
163
+ border: 1px solid var(--border);
164
+ border-radius: var(--radius);
165
+ padding: 3px;
166
+ margin-bottom: 20px;
167
+ overflow-x: auto;
168
+ }
169
+
170
+ .subtab {
171
+ padding: 7px 16px;
172
+ background: none;
173
+ border: none;
174
+ border-radius: 6px;
175
+ color: var(--text-dim);
176
+ cursor: pointer;
177
+ font-size: 0.8rem;
178
+ font-weight: 500;
179
+ white-space: nowrap;
180
+ transition: all 0.15s;
181
+ }
182
+
183
+ .subtab:hover {
184
+ color: var(--text);
185
+ background: var(--surface2);
186
+ }
187
+
188
+ .subtab-active {
189
+ color: #fff;
190
+ background: var(--accent);
191
+ font-weight: 600;
192
+ }
193
+
194
+ /* ---- Collapsible Toggle ---- */
195
+ .collapsible-toggle {
196
+ display: flex;
197
+ align-items: center;
198
+ gap: 8px;
199
+ width: 100%;
200
+ padding: 14px 16px;
201
+ margin: 16px 0;
202
+ background: var(--surface);
203
+ border: 1px solid var(--border);
204
+ border-radius: var(--radius);
205
+ color: var(--text-dim);
206
+ font-size: 0.85rem;
207
+ font-weight: 500;
208
+ cursor: pointer;
209
+ transition: color 0.15s, border-color 0.15s;
210
+ }
211
+
212
+ .collapsible-toggle:hover {
213
+ color: var(--text);
214
+ border-color: var(--accent-dim);
215
+ }
216
+
217
+ .collapsible-arrow {
218
+ font-size: 0.75rem;
219
+ }
220
+
221
+ /* ---- Advanced Settings Toggle ---- */
222
+ .advanced-toggle {
223
+ display: flex;
224
+ align-items: center;
225
+ gap: 6px;
226
+ padding: 0;
227
+ margin: 12px 0 0;
228
+ background: none;
229
+ border: none;
230
+ color: var(--text-dim);
231
+ font-size: 0.8rem;
232
+ font-weight: 500;
233
+ cursor: pointer;
234
+ transition: color 0.15s;
235
+ }
236
+
237
+ .advanced-toggle:hover {
238
+ color: var(--accent);
239
+ }
240
+
241
+ .advanced-section {
242
+ padding-top: 8px;
243
+ margin-bottom: 12px;
244
+ }
245
+
246
+ /* ---- Toggle (segmented control) ---- */
247
+ .toggle {
248
+ display: inline-flex;
249
+ gap: 2px;
250
+ background: var(--bg);
251
+ border: 1px solid var(--border);
252
+ border-radius: var(--radius);
253
+ padding: 3px;
254
+ }
255
+
256
+ .toggle-option {
257
+ padding: 6px 14px;
258
+ background: none;
259
+ border: none;
260
+ border-radius: 6px;
261
+ color: var(--text-dim);
262
+ font-size: 0.8rem;
263
+ font-weight: 500;
264
+ cursor: pointer;
265
+ transition: all 0.15s;
266
+ white-space: nowrap;
267
+ }
268
+
269
+ .toggle-option:hover {
270
+ color: var(--text);
271
+ }
272
+
273
+ .toggle-option-active {
274
+ background: var(--accent);
275
+ color: #fff;
276
+ font-weight: 600;
277
+ }
278
+
279
+ /* ---- Switch (on/off) ---- */
280
+ .switch {
281
+ display: inline-flex;
282
+ align-items: center;
283
+ gap: 8px;
284
+ cursor: pointer;
285
+ }
286
+
287
+ .switch-track {
288
+ position: relative;
289
+ width: 40px;
290
+ height: 22px;
291
+ border-radius: 11px;
292
+ background: var(--border);
293
+ border: none;
294
+ cursor: pointer;
295
+ padding: 0;
296
+ transition: background 0.2s;
297
+ }
298
+
299
+ .switch-track-on {
300
+ background: var(--accent);
301
+ }
302
+
303
+ .switch-thumb {
304
+ position: absolute;
305
+ top: 2px;
306
+ left: 2px;
307
+ width: 18px;
308
+ height: 18px;
309
+ border-radius: 50%;
310
+ background: #fff;
311
+ transition: transform 0.2s;
312
+ }
313
+
314
+ .switch-track-on .switch-thumb {
315
+ transform: translateX(18px);
316
+ }
317
+
318
+ .switch-label {
319
+ font-size: 0.8rem;
320
+ color: var(--text-dim);
321
+ font-weight: 500;
322
+ user-select: none;
323
+ }
324
+
325
+ /* ---- Custom Select ---- */
326
+ .custom-select {
327
+ position: relative;
328
+ min-width: 180px;
329
+ }
330
+
331
+ .custom-select-trigger {
332
+ display: flex;
333
+ align-items: center;
334
+ justify-content: space-between;
335
+ width: 100%;
336
+ padding: 8px 12px;
337
+ background: var(--bg);
338
+ border: 1px solid var(--border);
339
+ border-radius: var(--radius);
340
+ color: var(--text);
341
+ font-size: 0.875rem;
342
+ font-family: inherit;
343
+ cursor: pointer;
344
+ transition: border-color 0.15s;
345
+ text-align: left;
346
+ }
347
+
348
+ .custom-select-trigger:hover,
349
+ .custom-select-trigger:focus {
350
+ border-color: var(--accent);
351
+ outline: none;
352
+ }
353
+
354
+ .custom-select-arrow {
355
+ font-size: 0.7rem;
356
+ color: var(--text-dim);
357
+ margin-left: 8px;
358
+ }
359
+
360
+ .custom-select-dropdown {
361
+ position: absolute;
362
+ top: calc(100% + 4px);
363
+ left: 0;
364
+ right: 0;
365
+ background: var(--surface);
366
+ border: 1px solid var(--border);
367
+ border-radius: var(--radius);
368
+ padding: 4px;
369
+ z-index: 100;
370
+ max-height: 240px;
371
+ overflow-y: auto;
372
+ box-shadow: 0 8px 24px rgba(0, 0, 0, 0.4);
373
+ }
374
+
375
+ .custom-select-option {
376
+ display: block;
377
+ width: 100%;
378
+ padding: 8px 10px;
379
+ background: none;
380
+ border: none;
381
+ border-radius: 6px;
382
+ color: var(--text-dim);
383
+ font-size: 0.85rem;
384
+ font-family: inherit;
385
+ cursor: pointer;
386
+ text-align: left;
387
+ transition: all 0.1s;
388
+ }
389
+
390
+ .custom-select-option:hover {
391
+ background: var(--surface2);
392
+ color: var(--text);
393
+ }
394
+
395
+ .custom-select-option-active {
396
+ background: var(--accent);
397
+ color: #fff;
398
+ }
399
+
400
+ .custom-select-option-active:hover {
401
+ background: var(--accent-dim);
402
+ color: #fff;
403
+ }
404
+
405
+ /* ---- Server Error Banner ---- */
406
+ .server-error-banner {
407
+ background: #3a1a1a;
408
+ color: var(--err);
409
+ border: 1px solid #5a2a2a;
410
+ border-radius: var(--radius);
411
+ padding: 12px 16px;
412
+ margin-bottom: 20px;
413
+ font-size: 0.85rem;
414
+ line-height: 1.5;
415
+ }
416
+
417
+ /* ---- Content ---- */
418
+ .content {
419
+ min-height: 400px;
420
+ }
421
+
422
+ /* ---- Cards / Panels ---- */
423
+ .panel {
424
+ background: var(--surface);
425
+ border: 1px solid var(--border);
426
+ border-radius: var(--radius);
427
+ padding: 20px;
428
+ margin-bottom: 16px;
429
+ }
430
+
431
+ .panel h2 {
432
+ font-size: 1.1rem;
433
+ font-weight: 600;
434
+ margin-bottom: 12px;
435
+ }
436
+
437
+ .panel h3 {
438
+ font-size: 0.95rem;
439
+ font-weight: 600;
440
+ margin-bottom: 8px;
441
+ color: var(--text-dim);
442
+ }
443
+
444
+ /* ---- Forms ---- */
445
+ .form-row {
446
+ display: flex;
447
+ gap: 12px;
448
+ margin-bottom: 12px;
449
+ flex-wrap: wrap;
450
+ align-items: flex-end;
451
+ }
452
+
453
+ .form-group {
454
+ display: flex;
455
+ flex-direction: column;
456
+ gap: 4px;
457
+ flex: 1;
458
+ min-width: 180px;
459
+ }
460
+
461
+ .form-group label {
462
+ font-size: 0.8rem;
463
+ font-weight: 500;
464
+ color: var(--text-dim);
465
+ }
466
+
467
+ input,
468
+ textarea,
469
+ select {
470
+ padding: 8px 12px;
471
+ background: var(--bg);
472
+ border: 1px solid var(--border);
473
+ border-radius: var(--radius);
474
+ color: var(--text);
475
+ font-size: 0.875rem;
476
+ font-family: inherit;
477
+ }
478
+
479
+ input:focus,
480
+ textarea:focus,
481
+ select:focus {
482
+ outline: none;
483
+ border-color: var(--accent);
484
+ }
485
+
486
+ textarea {
487
+ resize: vertical;
488
+ min-height: 100px;
489
+ }
490
+
491
+ /* ---- Buttons ---- */
492
+ button.btn {
493
+ padding: 8px 20px;
494
+ border: none;
495
+ border-radius: var(--radius);
496
+ font-size: 0.875rem;
497
+ font-weight: 600;
498
+ cursor: pointer;
499
+ transition: background 0.15s, opacity 0.15s;
500
+ }
501
+
502
+ .btn-primary {
503
+ background: var(--accent);
504
+ color: #fff;
505
+ }
506
+ .btn-primary:hover:not(:disabled) {
507
+ background: var(--accent-dim);
508
+ }
509
+ .btn-secondary {
510
+ background: var(--surface2);
511
+ color: var(--text);
512
+ }
513
+ .btn-secondary:hover:not(:disabled) {
514
+ background: var(--border);
515
+ }
516
+
517
+ button:disabled {
518
+ opacity: 0.5;
519
+ cursor: not-allowed;
520
+ }
521
+
522
+ /* ---- Results ---- */
523
+ .result-card {
524
+ background: var(--surface2);
525
+ border: 1px solid var(--border);
526
+ border-radius: var(--radius);
527
+ padding: 16px;
528
+ margin-bottom: 10px;
529
+ transition: border-color 0.15s;
530
+ }
531
+
532
+ .result-card:hover {
533
+ border-color: var(--accent-dim);
534
+ }
535
+
536
+ .result-card .result-header {
537
+ display: flex;
538
+ justify-content: space-between;
539
+ align-items: center;
540
+ margin-bottom: 8px;
541
+ gap: 8px;
542
+ }
543
+
544
+ .result-card-selected {
545
+ border-color: var(--accent);
546
+ }
547
+
548
+ .result-card .result-text {
549
+ font-size: 0.85rem;
550
+ color: var(--text-dim);
551
+ line-height: 1.6;
552
+ }
553
+
554
+ .score-bar-container {
555
+ display: flex;
556
+ align-items: center;
557
+ gap: 8px;
558
+ }
559
+
560
+ .score-bar {
561
+ width: 120px;
562
+ height: 6px;
563
+ background: var(--bg);
564
+ border-radius: 3px;
565
+ overflow: hidden;
566
+ }
567
+
568
+ .score-bar-fill {
569
+ height: 100%;
570
+ border-radius: 3px;
571
+ transition: width 0.3s;
572
+ }
573
+
574
+ .score-label {
575
+ font-size: 0.8rem;
576
+ font-weight: 700;
577
+ font-variant-numeric: tabular-nums;
578
+ min-width: 48px;
579
+ text-align: right;
580
+ }
581
+
582
+ /* ---- Similarity gauge ---- */
583
+ .similarity-gauge {
584
+ display: flex;
585
+ align-items: center;
586
+ justify-content: center;
587
+ flex-direction: column;
588
+ padding: 24px;
589
+ }
590
+
591
+ .similarity-value {
592
+ font-size: 3rem;
593
+ font-weight: 800;
594
+ font-variant-numeric: tabular-nums;
595
+ }
596
+
597
+ .similarity-label {
598
+ font-size: 0.9rem;
599
+ color: var(--text-dim);
600
+ margin-top: 4px;
601
+ }
602
+
603
+ /* ---- Status / Alerts ---- */
604
+ .status {
605
+ padding: 10px 14px;
606
+ border-radius: var(--radius);
607
+ font-size: 0.85rem;
608
+ margin-bottom: 12px;
609
+ }
610
+ .status-ok {
611
+ background: #1a3a2a;
612
+ color: var(--ok);
613
+ }
614
+ .status-err {
615
+ background: #3a1a1a;
616
+ color: var(--err);
617
+ }
618
+ .status-loading {
619
+ background: var(--surface2);
620
+ color: var(--text-dim);
621
+ }
622
+
623
+ /* ---- Table ---- */
624
+ .data-table {
625
+ width: 100%;
626
+ border-collapse: collapse;
627
+ font-size: 0.85rem;
628
+ }
629
+ .data-table th,
630
+ .data-table td {
631
+ padding: 8px 12px;
632
+ text-align: left;
633
+ border-bottom: 1px solid var(--border);
634
+ }
635
+ .data-table th {
636
+ color: var(--text-dim);
637
+ font-weight: 600;
638
+ font-size: 0.8rem;
639
+ text-transform: uppercase;
640
+ letter-spacing: 0.5px;
641
+ }
642
+ .data-table tr:hover td {
643
+ background: var(--surface2);
644
+ }
645
+ .data-table input,
646
+ .data-table select {
647
+ font-size: 0.85rem;
648
+ }
649
+
650
+ /* ---- Grid ---- */
651
+ .grid-2 {
652
+ display: grid;
653
+ grid-template-columns: 1fr 1fr;
654
+ gap: 16px;
655
+ }
656
+
657
+ @media (max-width: 768px) {
658
+ .grid-2 {
659
+ grid-template-columns: 1fr;
660
+ }
661
+ }
662
+
663
+ /* ---- Metric Card ---- */
664
+ .metric-card {
665
+ background: var(--surface2);
666
+ border: 1px solid var(--border);
667
+ border-radius: var(--radius);
668
+ padding: 16px;
669
+ text-align: center;
670
+ }
671
+
672
+ .metric-value {
673
+ font-size: 1.75rem;
674
+ font-weight: 700;
675
+ color: var(--text);
676
+ font-variant-numeric: tabular-nums;
677
+ }
678
+
679
+ .metric-label {
680
+ font-size: 0.78rem;
681
+ color: var(--text-dim);
682
+ margin-top: 4px;
683
+ text-transform: uppercase;
684
+ letter-spacing: 0.3px;
685
+ }
686
+
687
+ /* ---- Spinner ---- */
688
+ .spinner {
689
+ display: inline-block;
690
+ width: 16px;
691
+ height: 16px;
692
+ border: 2px solid var(--text-dim);
693
+ border-top-color: var(--accent);
694
+ border-radius: 50%;
695
+ animation: spin 0.6s linear infinite;
696
+ margin-right: 6px;
697
+ vertical-align: middle;
698
+ }
699
+
700
+ @keyframes spin {
701
+ to {
702
+ transform: rotate(360deg);
703
+ }
704
+ }
705
+
706
+ /* ---- Tags ---- */
707
+ .tag {
708
+ display: inline-block;
709
+ padding: 2px 8px;
710
+ border-radius: 4px;
711
+ font-size: 0.75rem;
712
+ font-weight: 600;
713
+ background: var(--surface);
714
+ margin: 2px;
715
+ }
716
+
717
+ .tag-best {
718
+ background: #1a3a2a;
719
+ color: var(--ok);
720
+ }
721
+
722
+ /* ---- Utility classes ---- */
723
+ .panel-desc {
724
+ color: var(--text-dim);
725
+ font-size: 0.85rem;
726
+ margin-bottom: 12px;
727
+ }
728
+
729
+ .section-label {
730
+ font-size: 0.8rem;
731
+ font-weight: 600;
732
+ color: var(--text-dim);
733
+ margin-bottom: 4px;
734
+ }
735
+
736
+ .text-dim { color: var(--text-dim); }
737
+
738
+ .form-group-sm { max-width: 100px; }
739
+ .form-group-md { max-width: 140px; }
740
+ .form-group-lg { max-width: 220px; }
741
+
742
+ .metric-grid {
743
+ display: flex;
744
+ gap: 16px;
745
+ flex-wrap: wrap;
746
+ }
747
+ .metric-grid > * {
748
+ flex: 1 1 100px;
749
+ }
750
+
751
+ .flex-row { display: flex; gap: 8px; }
752
+ .flex-col { display: flex; flex-direction: column; }
753
+ .flex-wrap { flex-wrap: wrap; }
754
+
755
+ .gap-1 { gap: 8px; }
756
+ .gap-2 { gap: 12px; }
757
+ .gap-3 { gap: 16px; }
758
+
759
+ .mt-1 { margin-top: 8px; }
760
+ .mt-2 { margin-top: 12px; }
761
+ .mt-3 { margin-top: 16px; }
762
+ .mb-1 { margin-bottom: 8px; }
763
+ .mb-2 { margin-bottom: 12px; }
764
+ .mb-3 { margin-bottom: 16px; }
765
+
766
+ /* ---- Context Analysis bar chart ---- */
767
+ .context-bar-row {
768
+ display: flex;
769
+ align-items: center;
770
+ gap: 10px;
771
+ margin-bottom: 6px;
772
+ }
773
+
774
+ .context-bar-label {
775
+ width: 90px;
776
+ font-size: 0.82rem;
777
+ font-weight: 600;
778
+ text-align: right;
779
+ color: var(--text);
780
+ flex-shrink: 0;
781
+ }
782
+
783
+ .context-bar-track {
784
+ flex: 1;
785
+ height: 8px;
786
+ background: var(--bg);
787
+ border-radius: 4px;
788
+ overflow: hidden;
789
+ }
790
+
791
+ .context-bar-fill {
792
+ height: 100%;
793
+ background: var(--accent);
794
+ border-radius: 4px;
795
+ transition: width 0.3s;
796
+ }
797
+
798
+ .context-bar-value {
799
+ font-size: 0.75rem;
800
+ color: var(--text-dim);
801
+ width: 40px;
802
+ text-align: right;
803
+ flex-shrink: 0;
804
+ }
805
+
806
+ .context-snippet {
807
+ font-size: 0.8rem;
808
+ color: var(--text-dim);
809
+ line-height: 1.5;
810
+ padding: 8px 10px;
811
+ background: var(--bg);
812
+ border-radius: 6px;
813
+ margin-bottom: 4px;
814
+ }
815
+
816
+ .context-snippet-source {
817
+ font-size: 0.7rem;
818
+ color: var(--accent);
819
+ margin-right: 6px;
820
+ }
821
+
822
+ .score-bar-legend {
823
+ display: flex;
824
+ justify-content: space-between;
825
+ font-size: 0.75rem;
826
+ color: var(--text-dim);
827
+ margin-top: 4px;
828
+ }
frontend/src/types.ts ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // ---- API Request types ----
2
+
3
+ export interface InitRequest {
4
+ model_name: string;
5
+ chunk_size: number;
6
+ chunk_overlap: number;
7
+ batch_size: number;
8
+ }
9
+
10
+ export interface DocumentRequest {
11
+ doc_id: string;
12
+ text: string;
13
+ }
14
+
15
+ export interface QueryRequest {
16
+ text: string;
17
+ top_k: number;
18
+ }
19
+
20
+ export interface CompareRequest {
21
+ text_a: string;
22
+ text_b: string;
23
+ }
24
+
25
+ export interface KeywordAnalysisRequest {
26
+ keyword: string;
27
+ top_k: number;
28
+ cluster_threshold: number;
29
+ }
30
+
31
+ export interface KeywordMatchRequest {
32
+ keyword: string;
33
+ candidate_meanings: string[];
34
+ }
35
+
36
+ export interface BatchAnalysisRequest {
37
+ keywords: string[];
38
+ top_k: number;
39
+ cluster_threshold: number;
40
+ compare_across: boolean;
41
+ }
42
+
43
+ // ---- API Response types ----
44
+
45
+ export interface ChunkPreview {
46
+ index: number;
47
+ text: string;
48
+ }
49
+
50
+ export interface InitResponse {
51
+ status: string;
52
+ model: string;
53
+ load_time_seconds: number;
54
+ }
55
+
56
+ export interface AddDocResponse {
57
+ status: string;
58
+ doc_id: string;
59
+ num_chunks: number;
60
+ chunks_preview: ChunkPreview[];
61
+ }
62
+
63
+ export interface BuildIndexResponse {
64
+ status: string;
65
+ total_chunks: number;
66
+ embedding_dim: number;
67
+ build_time_seconds: number;
68
+ }
69
+
70
+ export interface QueryResultItem {
71
+ rank: number;
72
+ score: number;
73
+ doc_id: string;
74
+ chunk_index: number;
75
+ text: string;
76
+ }
77
+
78
+ export interface QueryResponse {
79
+ query: string;
80
+ results: QueryResultItem[];
81
+ }
82
+
83
+ export interface CompareResponse {
84
+ text_a: string;
85
+ text_b: string;
86
+ similarity: number;
87
+ }
88
+
89
+ export interface ClusterContext {
90
+ doc_id: string;
91
+ chunk_index: number;
92
+ text: string;
93
+ highlight_positions: [number, number][];
94
+ }
95
+
96
+ export interface SimilarPassage {
97
+ rank: number;
98
+ score: number;
99
+ doc_id: string;
100
+ text: string;
101
+ }
102
+
103
+ export interface MeaningCluster {
104
+ cluster_id: number;
105
+ size: number;
106
+ representative_text: string;
107
+ contexts: ClusterContext[];
108
+ similar_passages: SimilarPassage[];
109
+ }
110
+
111
+ export interface KeywordAnalysisResponse {
112
+ keyword: string;
113
+ total_occurrences: number;
114
+ meaning_clusters: MeaningCluster[];
115
+ cross_keyword_similarities: Record<string, number>;
116
+ }
117
+
118
+ export interface MatchResult {
119
+ doc_id: string;
120
+ chunk_index: number;
121
+ text: string;
122
+ best_match: string;
123
+ best_score: number;
124
+ all_scores: Record<string, number>;
125
+ }
126
+
127
+ export interface MatchResponse {
128
+ keyword: string;
129
+ candidate_meanings: string[];
130
+ matches: MatchResult[];
131
+ }
132
+
133
+ export interface CorpusStats {
134
+ total_chunks: number;
135
+ total_documents: number;
136
+ document_ids: string[];
137
+ index_built: boolean;
138
+ embedding_dim: number;
139
+ model_name: string;
140
+ }
141
+
142
+ export interface SimilarityDistribution {
143
+ sample_size: number;
144
+ mean: number;
145
+ std: number;
146
+ min: number;
147
+ max: number;
148
+ percentiles: Record<string, number>;
149
+ histogram: { bin_start: number; bin_end: number; count: number }[];
150
+ }
151
+
152
+ export interface DisambiguationMetric {
153
+ keyword: string;
154
+ accuracy: number;
155
+ weighted_f1: number;
156
+ per_meaning_precision: Record<string, number>;
157
+ per_meaning_recall: Record<string, number>;
158
+ per_meaning_f1: Record<string, number>;
159
+ confusion_matrix: number[][];
160
+ total_samples: number;
161
+ }
162
+
163
+ export interface RetrievalMetric {
164
+ query: string;
165
+ mrr: number;
166
+ precision_at_k: Record<string, number>;
167
+ recall_at_k: Record<string, number>;
168
+ ndcg_at_k: Record<string, number>;
169
+ avg_similarity: number;
170
+ top_score: number;
171
+ }
172
+
173
+ // ---- Training types ----
174
+
175
+ export interface TrainResponse {
176
+ strategy: string;
177
+ model_path: string;
178
+ training_pairs: number;
179
+ epochs: number;
180
+ seconds: number;
181
+ keywords?: string[];
182
+ }
183
+
184
+ export interface TrainEvalResponse {
185
+ pairs: {
186
+ text_a: string;
187
+ text_b: string;
188
+ expected: number;
189
+ base_score: number;
190
+ trained_score: number;
191
+ base_error: number;
192
+ trained_error: number;
193
+ }[];
194
+ summary: {
195
+ avg_base_error: number;
196
+ avg_trained_error: number;
197
+ error_reduction_pct: number;
198
+ improved: number;
199
+ degraded: number;
200
+ total: number;
201
+ };
202
+ }
203
+
204
+ // ---- Word2Vec types ----
205
+
206
+ export interface W2VInitResponse {
207
+ vocab_size: number;
208
+ sentences: number;
209
+ vector_size: number;
210
+ seconds: number;
211
+ }
212
+
213
+ export interface W2VQueryResult {
214
+ rank: number;
215
+ score: number;
216
+ doc_id: string;
217
+ text: string;
218
+ }
219
+
220
+ export interface W2VSimilarWord {
221
+ word: string;
222
+ score: number;
223
+ }
224
+
225
+ // ---- Dataset types ----
226
+
227
+ export interface DatasetSourceInfo {
228
+ dataset_id: string;
229
+ url: string;
230
+ description: string;
231
+ columns?: string[];
232
+ size_mb?: number;
233
+ model?: string;
234
+ vector_dim?: number;
235
+ }
236
+
237
+ export interface DatasetInfo {
238
+ raw_texts: DatasetSourceInfo;
239
+ embeddings: DatasetSourceInfo;
240
+ }
241
+
242
+ export interface DatasetLoadRequest {
243
+ source: "raw" | "embeddings";
244
+ max_docs: number;
245
+ min_text_length: number;
246
+ source_filter?: string;
247
+ build_index: boolean;
248
+ }
249
+
250
+ export interface DatasetLoadResponse {
251
+ documents_loaded?: number;
252
+ documents_skipped?: number;
253
+ documents_created?: number;
254
+ total_chunks?: number;
255
+ chunks_indexed?: number;
256
+ chromadb_vectors?: number;
257
+ index_built: boolean;
258
+ seconds: number;
259
+ source?: string;
260
+ }
261
+
262
+ export interface DatasetPreviewDoc {
263
+ doc_id: string;
264
+ filename: string;
265
+ text_preview: string;
266
+ text_length: number;
267
+ }
268
+
269
+ export interface DatasetPreviewResponse {
270
+ count: number;
271
+ documents: DatasetPreviewDoc[];
272
+ }
273
+
274
+ // ---- Context Analysis types ----
275
+
276
+ export interface ContextAssociatedWord {
277
+ word: string;
278
+ score: number;
279
+ }
280
+
281
+ export interface ContextExample {
282
+ doc_id: string;
283
+ snippet: string;
284
+ }
285
+
286
+ export interface ContextMeaning {
287
+ cluster_id: number;
288
+ occurrences: number;
289
+ confidence: number;
290
+ associated_words: ContextAssociatedWord[];
291
+ example_contexts: ContextExample[];
292
+ }
293
+
294
+ export interface ContextAnalysisResponse {
295
+ keyword: string;
296
+ total_occurrences: number;
297
+ meanings: ContextMeaning[];
298
+ }
299
+
300
+ // ---- UI State ----
301
+
302
+ export type EvalSection = "distribution" | "disambiguation" | "retrieval";
frontend/src/utils/colors.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ /** Map a 0–1 similarity/score to a CSS color variable. */
2
+ export function scoreColor(score: number): string {
3
+ if (score >= 0.7) return "var(--ok)";
4
+ if (score >= 0.4) return "var(--warn)";
5
+ return "var(--err)";
6
+ }
frontend/src/vite-env.d.ts ADDED
@@ -0,0 +1 @@
 
 
1
+ /// <reference types="vite/client" />
frontend/tsconfig.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "useDefineForClassFields": true,
5
+ "lib": ["ES2020", "DOM", "DOM.Iterable"],
6
+ "module": "ESNext",
7
+ "skipLibCheck": true,
8
+ "moduleResolution": "bundler",
9
+ "allowImportingTsExtensions": true,
10
+ "isolatedModules": true,
11
+ "moduleDetection": "force",
12
+ "noEmit": true,
13
+ "jsx": "react-jsx",
14
+ "strict": true,
15
+ "noUnusedLocals": false,
16
+ "noUnusedParameters": false,
17
+ "noFallthroughCasesInSwitch": true,
18
+ "forceConsistentCasingInFileNames": true
19
+ },
20
+ "include": ["src"]
21
+ }
frontend/vite.config.ts ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from "vite";
2
+ import react from "@vitejs/plugin-react";
3
+
4
+ export default defineConfig({
5
+ plugins: [react()],
6
+ server: {
7
+ proxy: {
8
+ "/api/logs/stream": {
9
+ target: "http://localhost:8000",
10
+ headers: { "Accept": "text/event-stream" },
11
+ },
12
+ "/api": "http://localhost:8000",
13
+ },
14
+ },
15
+ });
pyproject.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "esfiles-ndr"
3
+ version = "1.0.0"
4
+ description = "Contextual word similarity analysis using transformer embeddings and Word2Vec baseline"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "sentence-transformers>=5.2.3",
8
+ "faiss-cpu>=1.13.2",
9
+ "torch>=2.10.0",
10
+ "numpy>=2.4.3",
11
+ "scikit-learn>=1.8.0",
12
+ "tqdm>=4.67.3",
13
+ "gensim>=4.4.0",
14
+ "fastapi>=0.135.1",
15
+ "uvicorn[standard]>=0.41.0",
16
+ "python-multipart>=0.0.22",
17
+ "accelerate>=1.13.0",
18
+ "datasets>=4.7.0",
19
+ "chromadb>=1.5.4",
20
+ ]
21
+
22
+ [project.scripts]
23
+ serve = "server:main"
24
+ demo = "demo:main"
25
+
26
+ [tool.uv]
27
+ dev-dependencies = []
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentence-transformers>=5.2.3
2
+ faiss-cpu>=1.13.2
3
+ torch>=2.10.0
4
+ numpy>=2.4.3
5
+ scikit-learn>=1.8.0
6
+ tqdm>=4.67.3
7
+ gensim>=4.4.0
8
+ fastapi>=0.135.1
9
+ uvicorn[standard]>=0.41.0
10
+ python-multipart>=0.0.22
11
+ datasets>=4.7.0
12
+ chromadb>=1.5.4
13
+ accelerate>=1.13.0