Besjon Cifliku committed on
Commit
db764ae
·
1 Parent(s): 9f009c2

feat: initial project setup

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +42 -0
  2. .gitignore +55 -0
  3. Dockerfile +59 -0
  4. HOWTO.md +390 -0
  5. README.md +193 -2
  6. contextual_similarity.py +850 -0
  7. data_loader.py +286 -0
  8. demo.py +233 -0
  9. docker-compose.yml +18 -0
  10. evaluation.py +547 -0
  11. frontend/.gitignore +24 -0
  12. frontend/README.md +16 -0
  13. frontend/eslint.config.js +29 -0
  14. frontend/index.html +12 -0
  15. frontend/package-lock.json +0 -0
  16. frontend/package.json +30 -0
  17. frontend/public/vite.svg +1 -0
  18. frontend/src/App.tsx +182 -0
  19. frontend/src/api.ts +144 -0
  20. frontend/src/assets/react.svg +1 -0
  21. frontend/src/components/BatchAnalysis.tsx +110 -0
  22. frontend/src/components/ContextAnalysis.tsx +116 -0
  23. frontend/src/components/DatasetPanel.tsx +246 -0
  24. frontend/src/components/EngineSetup.tsx +172 -0
  25. frontend/src/components/EvaluationDashboard.tsx +603 -0
  26. frontend/src/components/KeywordAnalysis.tsx +100 -0
  27. frontend/src/components/KeywordMatcher.tsx +90 -0
  28. frontend/src/components/LogViewer.tsx +71 -0
  29. frontend/src/components/MetricCard.tsx +16 -0
  30. frontend/src/components/ScoreBar.tsx +19 -0
  31. frontend/src/components/Select.tsx +60 -0
  32. frontend/src/components/SemanticSearch.tsx +70 -0
  33. frontend/src/components/SimilarWords.tsx +75 -0
  34. frontend/src/components/StatusMessage.tsx +13 -0
  35. frontend/src/components/Switch.tsx +22 -0
  36. frontend/src/components/TextCompare.tsx +84 -0
  37. frontend/src/components/Toggle.tsx +27 -0
  38. frontend/src/components/TrainingPanel.tsx +349 -0
  39. frontend/src/components/Word2VecPanel.tsx +293 -0
  40. frontend/src/hooks/useApiCall.ts +34 -0
  41. frontend/src/hooks/useCorpusLoader.ts +48 -0
  42. frontend/src/main.tsx +9 -0
  43. frontend/src/styles.css +828 -0
  44. frontend/src/types.ts +302 -0
  45. frontend/src/utils/colors.ts +6 -0
  46. frontend/src/vite-env.d.ts +1 -0
  47. frontend/tsconfig.json +21 -0
  48. frontend/vite.config.ts +15 -0
  49. pyproject.toml +27 -0
  50. requirements.txt +12 -0
.dockerignore ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated data & model artifacts
2
+ engine_state/
3
+ chroma_epstein/
4
+ checkpoints/
5
+ trained_model/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ .venv/
11
+ venv/
12
+ *.egg-info/
13
+
14
+ # Node (frontend is built inside Docker)
15
+ frontend/node_modules/
16
+ frontend/dist/
17
+
18
+ # Git
19
+ .git/
20
+ .gitattributes
21
+
22
+ # OS & IDE
23
+ .DS_Store
24
+ .vscode/
25
+ .idea/
26
+
27
+ # HuggingFace cache
28
+ .cache/
29
+
30
+ # Docs (not needed in image)
31
+ HOWTO.md
32
+ README.md
33
+
34
+ # Docker (avoid recursive COPY)
35
+ Dockerfile
36
+ docker-compose.yml
37
+ .dockerignore
38
+
39
+ # Env & logs
40
+ .env
41
+ .env.local
42
+ *.log
.gitignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .venv/
11
+ venv/
12
+ .Python
13
+
14
+ # Node / Frontend
15
+ frontend/node_modules/
16
+ frontend/dist/
17
+ frontend/dist-ssr/
18
+ npm-debug.log*
19
+ yarn-debug.log*
20
+ pnpm-debug.log*
21
+
22
+ # Generated data & model artifacts
23
+ engine_state/
24
+ chroma_epstein/
25
+ checkpoints/
26
+ trained_model/
27
+ *.faiss
28
+ *.npy
29
+ *.pkl
30
+ *.pickle
31
+
32
+ # HuggingFace cache
33
+ .cache/
34
+
35
+ # OS
36
+ .DS_Store
37
+ Thumbs.db
38
+
39
+ # IDEs
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ *.suo
45
+ *.ntvs*
46
+ *.njsproj
47
+ *.sln
48
+
49
+ # Environment
50
+ .env
51
+ .env.local
52
+ .env.*.local
53
+
54
+ # Logs
55
+ *.log
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================
2
+ # Multi-stage Docker build for Contextual Similarity Engine
3
+ # Single container: React frontend + FastAPI backend
4
+ # Deploys to: HuggingFace Spaces (Docker SDK), local, Railway
5
+ # =============================================================
6
+
7
+ # Stage 1: Build frontend
8
+ FROM node:22-slim AS frontend-build
9
+ WORKDIR /app/frontend
10
+ COPY frontend/package.json frontend/package-lock.json ./
11
+ RUN npm ci
12
+ COPY frontend/ ./
13
+ RUN npm run build
14
+
15
+ # Stage 2: Python runtime
16
+ FROM python:3.12-slim AS runtime
17
+
18
+ # Create non-root user (required by HF Spaces)
19
+ RUN useradd -m -u 1000 appuser
20
+ WORKDIR /app
21
+
22
+ # System deps for faiss-cpu and torch
23
+ RUN apt-get update && apt-get install -y --no-install-recommends \
24
+ build-essential \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Install uv for fast dependency resolution
28
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
29
+
30
+ # Copy dependency files first (cache layer)
31
+ COPY --chown=appuser pyproject.toml uv.lock ./
32
+
33
+ # Install Python dependencies
34
+ RUN uv sync --frozen --no-dev
35
+
36
+ # Copy backend source
37
+ COPY --chown=appuser *.py ./
38
+
39
+ # Copy pre-built frontend
40
+ COPY --chown=appuser --from=frontend-build /app/frontend/dist ./frontend/dist
41
+
42
+ # Data directories (HF cache, engine state, trained models)
43
+ RUN mkdir -p /data/huggingface /data/engine_state /data/trained_model \
44
+ && chown -R appuser:appuser /app /data
45
+
46
+ ENV HF_HOME=/data/huggingface
47
+ ENV TRANSFORMERS_CACHE=/data/huggingface
48
+ ENV ENGINE_STATE_DIR=/data/engine_state
49
+
50
+ # Switch to non-root user
51
+ USER appuser
52
+
53
+ # Expose port (HF Spaces expects 7860, override via PORT env)
54
+ EXPOSE 7860
55
+
56
+ # Run the server — HOST and PORT configurable via env
57
+ ENV HOST=0.0.0.0
58
+ ENV PORT=7860
59
+ CMD ["uv", "run", "python", "server.py"]
HOWTO.md ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contextual Similarity Engine — HOWTO
2
+
3
+ ## Overview
4
+
5
+ This project uses **transformer-based sentence embeddings** to find and compare
6
+ contextual meanings of keywords within large documents. Unlike Word2Vec (static,
7
+ one-vector-per-word), this system **fine-tunes on YOUR corpus** so it learns
8
+ domain-specific patterns — e.g. that "pizza" means "school" in your data.
9
+
10
+ A **Word2Vec (gensim) baseline** is included for comparison, demonstrating why
11
+ contextual embeddings are superior for meaning disambiguation.
12
+
13
+ **The pipeline is: TRAIN → INDEX → ANALYZE → EVALUATE.**
14
+
15
+ **Stack:**
16
+ - **SentenceTransformers** — contextual embeddings (PyTorch)
17
+ - **FAISS** — fast vector similarity search
18
+ - **gensim Word2Vec** — static embedding baseline for comparison
19
+ - **FastAPI** — REST API backend
20
+ - **React + TypeScript** — visualization frontend
21
+ - **scikit-learn** — clustering & evaluation metrics
22
+
23
+ ---
24
+
25
+ ## 1. Install Dependencies
26
+
27
+ ### Python backend (uv — recommended)
28
+
29
+ [uv](https://docs.astral.sh/uv/) is a fast Python package manager that replaces
30
+ `pip`, `venv`, and `requirements.txt` with a single tool and lockfile.
31
+
32
+ ```bash
33
+ # Install uv (if not already installed)
34
+ curl -LsSf https://astral.sh/uv/install.sh | sh
35
+
36
+ # Create a virtual environment and install all dependencies from pyproject.toml
37
+ cd esfiles
38
+ uv sync
39
+
40
+ # Run commands inside the managed environment
41
+ uv run python server.py
42
+ uv run python demo.py
43
+ ```
44
+
45
+ `uv sync` reads `pyproject.toml`, resolves dependencies, creates a `.venv`,
46
+ and generates a `uv.lock` lockfile for reproducible installs. The lockfile
47
+ pins exact versions so every machine gets identical dependencies.
48
+
49
+ **Adding/removing packages:**
50
+
51
+ ```bash
52
+ uv add httpx # add a new dependency
53
+ uv remove httpx # remove it
54
+ uv lock --upgrade # upgrade all packages to latest compatible versions
55
+ ```
56
+
57
+ ### Python backend (pip — alternative)
58
+
59
+ ```bash
60
+ python3 -m venv venv
61
+ source venv/bin/activate
62
+ pip install -r requirements.txt
63
+ ```
64
+
65
+ ### React frontend
66
+
67
+ ```bash
68
+ cd frontend
69
+ npm install
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 2. Quick Start
75
+
76
+ ### CLI demo (Word2Vec vs Transformer comparison)
77
+
78
+ ```bash
79
+ uv run python demo.py
80
+ ```
81
+
82
+ This runs side-by-side comparison:
83
+ 1. Builds both Transformer and Word2Vec engines on the same corpus
84
+ 2. Compares text similarity scores between approaches
85
+ 3. Shows word-level similarity (Word2Vec only — transformers don't do single words)
86
+ 4. Runs semantic search with both engines
87
+ 5. Tests keyword meaning matching ("pizza" → food or school?)
88
+ 6. Demonstrates clustering (transformer can separate meanings, Word2Vec cannot)
89
+
90
+ ### Web UI
91
+
92
+ ```bash
93
+ # Terminal 1: start the API server
94
+ uv run python server.py
95
+
96
+ # Terminal 2: start the React dev server
97
+ cd frontend && npm run dev
98
+ ```
99
+
100
+ - API docs: `http://localhost:8000/docs`
101
+ - Frontend: `http://localhost:5173`
102
+
103
+ ---
104
+
105
+ ## 3. Training Your Model
106
+
107
+ Three strategies, from simplest to most powerful:
108
+
109
+ ### Strategy 1: Unsupervised (TSDAE)
110
+
111
+ No labels needed. Learns your corpus vocabulary and phrasing via denoising autoencoder.
112
+
113
+ ```python
114
+ from training import CorpusTrainer
115
+
116
+ corpus_texts = [open(f).read() for f in your_files]
117
+ trainer = CorpusTrainer(corpus_texts, base_model="all-MiniLM-L6-v2")
118
+
119
+ result = trainer.train_unsupervised(
120
+ output_path="./trained_model",
121
+ epochs=3,
122
+ batch_size=16,
123
+ )
124
+ print(f"Trained on {result['training_pairs']} sentences in {result['seconds']}s")
125
+ ```
126
+
127
+ ### Strategy 2: Contrastive (auto-mined pairs)
128
+
129
+ Adjacent sentences = similar, random sentences = dissimilar. Learns document structure
130
+ using MultipleNegativesRankingLoss with in-batch negatives.
131
+
132
+ ```python
133
+ trainer = CorpusTrainer(corpus_texts)
134
+
135
+ result = trainer.train_contrastive(
136
+ output_path="./trained_model",
137
+ epochs=5,
138
+ batch_size=16,
139
+ )
140
+ ```
141
+
142
+ ### Strategy 3: Keyword-supervised (best if you know the code words)
143
+
144
+ You provide a keyword→meaning map. The trainer auto-generates training pairs:
145
+ keyword-in-context ↔ meaning-substituted version, plus contrastive pairs from
146
+ corpus structure.
147
+
148
+ ```python
149
+ trainer = CorpusTrainer(corpus_texts)
150
+
151
+ result = trainer.train_with_keywords(
152
+ keyword_meanings={"pizza": "school", "pepperoni": "math class"},
153
+ output_path="./trained_model",
154
+ epochs=5,
155
+ batch_size=16,
156
+ )
157
+ print(f"Keywords: {result['keywords']}")
158
+ ```
159
+
160
+ ### Verifying training worked
161
+
162
+ ```python
163
+ # Compare base model vs trained model on test pairs
164
+ comparison = trainer.evaluate_model(
165
+ test_pairs=[
166
+ ("pizza gives me homework", "school gives me homework", 0.95),
167
+ ("pizza gives me homework", "I ate delicious pizza", 0.1),
168
+ ("The pizza test is hard", "The school exam is difficult", 0.9),
169
+ ],
170
+ trained_model_path="./trained_model",
171
+ )
172
+
173
+ print(f"Base error: {comparison['summary']['avg_base_error']:.4f}")
174
+ print(f"Trained error: {comparison['summary']['avg_trained_error']:.4f}")
175
+ print(f"Reduction: {comparison['summary']['error_reduction_pct']:.1f}%")
176
+ print(f"Improved: {comparison['summary']['improved']}/{comparison['summary']['total']}")
177
+ ```
178
+
179
+ ---
180
+
181
+ ## 4. Using Your Trained Model
182
+
183
+ After training, use the saved model path instead of the pretrained model name:
184
+
185
+ ```python
186
+ from contextual_similarity import ContextualSimilarityEngine
187
+
188
+ engine = ContextualSimilarityEngine(model_name="./trained_model")
189
+
190
+ engine.add_document("doc1", open("doc1.txt").read())
191
+ engine.build_index()
192
+
193
+ # Queries now use your domain-trained embeddings
194
+ results = engine.query("pizza homework", top_k=10)
195
+ matches = engine.match_keyword_to_meaning("pizza", [
196
+ "Italian food, restaurant, cooking",
197
+ "School, education, homework and tests",
198
+ ])
199
+ ```
200
+
201
+ ---
202
+
203
+ ## 5. Word2Vec Baseline Comparison
204
+
205
+ A gensim Word2Vec engine is included to demonstrate the difference between
206
+ static and contextual embeddings:
207
+
208
+ ```python
209
+ from word2vec_baseline import Word2VecEngine
210
+
211
+ w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
212
+ for doc_id, text in docs.items():
213
+ w2v.add_document(doc_id, text)
214
+ w2v.build_index()
215
+
216
+ # Word-level: which words appear in similar contexts?
217
+ w2v.most_similar_words("pizza", top_k=5)
218
+
219
+ # Sentence-level: averaged word vectors (lossy)
220
+ w2v.compare_texts("pizza gives me homework", "school gives me homework")
221
+
222
+ # Search
223
+ w2v.query("a place where children learn", top_k=3)
224
+ ```
225
+
226
+ **Key limitation:** Word2Vec gives ONE vector per word. "pizza" always has the
227
+ same embedding whether it means food or school. Transformers encode the full
228
+ surrounding context, so the same word gets different embeddings in different passages.
229
+
230
+ ---
231
+
232
+ ## 6. Using the Web UI
233
+
234
+ 1. **Train Model** (start here):
235
+ - Paste your corpus (documents separated by blank lines)
236
+ - Choose strategy: Unsupervised, Contrastive, or Keyword-supervised
237
+ - For keyword strategy, provide a JSON keyword→meaning map
238
+ - Configure base model, epochs, batch size, output path
239
+ - Click "Start Training" — model trains and saves to disk
240
+ - Run "Compare Models" to evaluate base vs trained
241
+
242
+ 2. **Setup:**
243
+ - Initialize engine with your trained model path (e.g. `./trained_model`)
244
+ - Add documents and build the FAISS index
245
+
246
+ 3. **Semantic Search:** query the corpus with trained embeddings
247
+ 4. **Compare Texts:** cosine similarity between any two texts
248
+ 5. **Keyword Analysis:** auto-cluster keyword meanings across documents
249
+ 6. **Keyword Matcher:** match keyword occurrences to candidate meanings
250
+ 7. **Batch Analysis:** multi-keyword analysis with cross-similarity matrix
251
+ 8. **Evaluation:** disambiguation accuracy, retrieval P@K/MRR, similarity histograms
252
+
253
+ ---
254
+
255
+ ## 7. API Endpoints
256
+
257
+ ### Training
258
+ | Method | Endpoint | Description |
259
+ |--------|----------|-------------|
260
+ | POST | `/api/train/unsupervised` | TSDAE domain adaptation |
261
+ | POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
262
+ | POST | `/api/train/keywords` | Keyword-supervised training |
263
+ | POST | `/api/train/evaluate` | Compare base vs trained model |
264
+
265
+ ### Engine
266
+ | Method | Endpoint | Description |
267
+ |--------|----------|-------------|
268
+ | POST | `/api/init` | Initialize engine with a model |
269
+ | POST | `/api/documents` | Add a document to the corpus |
270
+ | POST | `/api/documents/upload` | Upload a file as a document |
271
+ | POST | `/api/index/build` | Build FAISS index |
272
+ | POST | `/api/query` | Semantic search |
273
+ | POST | `/api/compare` | Compare two texts |
274
+ | POST | `/api/analyze/keyword` | Single keyword analysis |
275
+ | POST | `/api/analyze/batch` | Multi-keyword batch analysis |
276
+ | POST | `/api/match` | Match keyword to candidate meanings |
277
+ | GET | `/api/stats` | Corpus statistics |
278
+
279
+ ### Evaluation
280
+ | Method | Endpoint | Description |
281
+ |--------|----------|-------------|
282
+ | POST | `/api/eval/disambiguation` | Disambiguation accuracy |
283
+ | POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
284
+ | GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
285
+
286
+ ### Word2Vec Baseline
287
+ | Method | Endpoint | Description |
288
+ |--------|----------|-------------|
289
+ | POST | `/api/w2v/init` | Train Word2Vec on corpus |
290
+ | POST | `/api/w2v/compare` | Compare two texts (averaged word vectors) |
291
+ | POST | `/api/w2v/query` | Search corpus |
292
+ | POST | `/api/w2v/similar-words` | Find similar words |
293
+
294
+ ---
295
+
296
+ ## 8. Available Base Models
297
+
298
+ | Model | Dim | Size | Quality | Speed |
299
+ |-------|-----|------|---------|-------|
300
+ | `all-MiniLM-L6-v2` | 384 | ~80MB | Good | Fast |
301
+ | `all-mpnet-base-v2` | 768 | ~420MB | Best | Medium |
302
+
303
+ Start with `all-MiniLM-L6-v2` for fast iteration, upgrade to `all-mpnet-base-v2`
304
+ for production quality.
305
+
306
+ ---
307
+
308
+ ## 9. Evaluation Metrics
309
+
310
+ | Metric | What it measures |
311
+ |--------|-----------------|
312
+ | **Accuracy** | % of keyword occurrences correctly matched to their meaning |
313
+ | **Weighted F1** | Harmonic mean of precision/recall, weighted by class frequency |
314
+ | **MRR** | Mean Reciprocal Rank — how early the first relevant result appears |
315
+ | **P@K** | Precision at K — fraction of top-K results that are relevant |
316
+ | **NDCG@K** | Normalized Discounted Cumulative Gain — ranking quality metric |
317
+
318
+ ---
319
+
320
+ ## 10. Tuning Parameters
321
+
322
+ ### Training
323
+
324
+ | Parameter | Default | Notes |
325
+ |-----------|---------|-------|
326
+ | `epochs` | 3-5 | More = better fit but risk overfitting |
327
+ | `batch_size` | 16 | Larger = faster, needs more memory. MNRL benefits from larger batches |
328
+ | `context_window` | 2 | (Keyword strategy) sentences around keyword to include as context |
329
+
330
+ ### Engine
331
+
332
+ | Parameter | Default | Notes |
333
+ |-----------|---------|-------|
334
+ | `chunk_size` | 512 | Characters per chunk. Larger = more context per chunk |
335
+ | `chunk_overlap` | 128 | Overlap prevents losing context at chunk boundaries |
336
+ | `batch_size` | 64 | Encoding batch size for FAISS indexing |
337
+
338
+ ---
339
+
340
+ ## 11. Computational Resources
341
+
342
+ | Task | CPU | GPU (CUDA/MPS) | RAM |
343
+ |------|-----|----------------|-----|
344
+ | Training (small, <1K pairs) | OK | Faster (2-5x) | 4GB+ |
345
+ | Training (medium, 1K-10K pairs) | Slow | Recommended | 8GB+ |
346
+ | Training (large, 10K+ pairs) | Very slow | Required | 16GB+ |
347
+ | Indexing (1K chunks) | OK | Faster | 4GB+ |
348
+ | Querying | Fast | N/A | 2GB+ |
349
+
350
+ **Minimum:** MacBook with 8GB RAM can train small models on CPU.
351
+ **Recommended:** 16GB RAM + GPU (NVIDIA CUDA or Apple Silicon MPS).
352
+
353
+ ---
354
+
355
+ ## 12. Project Structure
356
+
357
+ ```
358
+ esfiles/
359
+ ├── pyproject.toml # Project config & dependencies (uv)
360
+ ├── requirements.txt # Fallback for pip users
361
+ ├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
362
+ ├── training.py # Training pipeline: 3 strategies + evaluation
363
+ ├── evaluation.py # Evaluation pipeline: metrics, reports
364
+ ├── word2vec_baseline.py # Gensim Word2Vec baseline for comparison
365
+ ├── server.py # FastAPI REST API
366
+ ├── demo.py # CLI demo: Word2Vec vs Transformer comparison
367
+ ├── HOWTO.md # This file
368
+ └── frontend/ # React + TypeScript UI
369
+ ├── package.json
370
+ ├── tsconfig.json
371
+ ├── vite.config.ts
372
+ ├── index.html
373
+ └── src/
374
+ ├── main.tsx
375
+ ├── App.tsx
376
+ ├── styles.css
377
+ ├── types.ts
378
+ ├── api.ts
379
+ └── components/
380
+ ├── ScoreBar.tsx
381
+ ├── StatusMessage.tsx
382
+ ├── TrainingPanel.tsx
383
+ ├── EngineSetup.tsx
384
+ ├── SemanticSearch.tsx
385
+ ├── TextCompare.tsx
386
+ ├── KeywordAnalysis.tsx
387
+ ├── KeywordMatcher.tsx
388
+ ├── BatchAnalysis.tsx
389
+ └── EvaluationDashboard.tsx
390
+ ```
README.md CHANGED
@@ -1,12 +1,203 @@
1
  ---
2
  title: Esfiles
3
- emoji: 🏢
4
  colorFrom: green
5
  colorTo: green
6
  sdk: docker
 
7
  pinned: false
8
  license: apache-2.0
9
  short_description: 'A prototype to analyze embeddings and word correlations '
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Esfiles
3
+ emoji: "\U0001F3E2"
4
  colorFrom: green
5
  colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  license: apache-2.0
10
  short_description: 'A prototype to analyze embeddings and word correlations '
11
  ---
12
 
13
+ # Esfiles Contextual Similarity Engine
14
+
15
+ A tool for analyzing word meanings in context using **transformer-based embeddings**. Unlike traditional approaches (Word2Vec) that assign one static vector per word, this system **fine-tunes on your corpus** so the same word gets different embeddings depending on its surrounding context — e.g. detecting that "pizza" is used as code for "school" in a set of documents.
16
+
17
+ Includes a **Word2Vec baseline** for side-by-side comparison.
18
+
19
+ ## Stack
20
+
21
+ | Layer | Technology |
22
+ |-------|-----------|
23
+ | Embeddings | SentenceTransformers (PyTorch) |
24
+ | Vector search | FAISS |
25
+ | Baseline | gensim Word2Vec |
26
+ | Backend | FastAPI (Python) |
27
+ | Frontend | React 19 + TypeScript + Vite |
28
+ | Evaluation | scikit-learn metrics |
29
+ | Deployment | Docker (HuggingFace Spaces, local, Railway) |
30
+
31
+ ## Prerequisites
32
+
33
+ - **Python 3.11+**
34
+ - **Node.js 18+** (for frontend)
35
+ - [uv](https://docs.astral.sh/uv/) (recommended) or pip
36
+
37
+ ## Setup
38
+
39
+ ### 1. Clone the repo
40
+
41
+ ```bash
42
+ git clone <repo-url>
43
+ cd esfiles
44
+ ```
45
+
46
+ ### 2. Install Python dependencies
47
+
48
+ **With uv (recommended):**
49
+
50
+ ```bash
51
+ curl -LsSf https://astral.sh/uv/install.sh | sh
52
+ uv sync
53
+ ```
54
+
55
+ **With pip:**
56
+
57
+ ```bash
58
+ python3 -m venv venv
59
+ source venv/bin/activate
60
+ pip install -r requirements.txt
61
+ ```
62
+
63
+ ### 3. Install frontend dependencies
64
+
65
+ ```bash
66
+ cd frontend
67
+ npm install
68
+ cd ..
69
+ ```
70
+
71
+ ## Usage
72
+
73
+ ### CLI demo
74
+
75
+ Run the Word2Vec vs Transformer comparison demo:
76
+
77
+ ```bash
78
+ uv run python demo.py
79
+ ```
80
+
81
+ This builds both engines on a sample corpus and compares similarity scores, semantic search, keyword matching, and clustering.
82
+
83
+ ### Web UI (development)
84
+
85
+ ```bash
86
+ # Terminal 1 — API server
87
+ uv run python server.py
88
+
89
+ # Terminal 2 — React dev server
90
+ cd frontend && npm run dev
91
+ ```
92
+
93
+ - **API docs:** http://localhost:8000/docs
94
+ - **Frontend:** http://localhost:5173
95
+
96
+ ### Docker
97
+
98
+ ```bash
99
+ docker compose up --build
100
+ ```
101
+
102
+ The app will be available at http://localhost:8000. The Docker build compiles the React frontend and bundles it with the FastAPI server in a single container.
103
+
104
+ ## How it works
105
+
106
+ **Pipeline: TRAIN → INDEX → ANALYZE → EVALUATE**
107
+
108
+ 1. **Train** — Fine-tune a pretrained sentence-transformer on your corpus using one of three strategies:
109
+ - **Unsupervised (TSDAE):** No labels needed. Learns vocabulary and phrasing via denoising autoencoder.
110
+ - **Contrastive:** Auto-mines training pairs from document structure (adjacent sentences = similar).
111
+ - **Keyword-supervised:** You provide a keyword→meaning map (e.g. `{"pizza": "school"}`). The trainer generates context-aware training pairs.
112
+
113
+ 2. **Index** — Chunk your documents and encode them into a FAISS vector index using the fine-tuned model.
114
+
115
+ 3. **Analyze** — Query the index with semantic search, compare texts, analyze keyword meanings across documents, or match keywords to candidate meanings.
116
+
117
+ 4. **Evaluate** — Measure disambiguation accuracy, retrieval metrics (P@K, MRR, NDCG), and clustering quality (NMI).
118
+
119
+ ## API endpoints
120
+
121
+ ### Training
122
+ | Method | Endpoint | Description |
123
+ |--------|----------|-------------|
124
+ | POST | `/api/train/unsupervised` | TSDAE domain adaptation |
125
+ | POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
126
+ | POST | `/api/train/keywords` | Keyword-supervised training |
127
+ | POST | `/api/train/evaluate` | Compare base vs trained model |
128
+
129
+ ### Engine
130
+ | Method | Endpoint | Description |
131
+ |--------|----------|-------------|
132
+ | POST | `/api/init` | Initialize engine with a model |
133
+ | POST | `/api/documents` | Add a document |
134
+ | POST | `/api/documents/upload` | Upload a file as a document |
135
+ | POST | `/api/index/build` | Build FAISS index |
136
+ | POST | `/api/query` | Semantic search |
137
+ | POST | `/api/compare` | Compare two texts |
138
+ | POST | `/api/analyze/keyword` | Single keyword analysis |
139
+ | POST | `/api/analyze/batch` | Multi-keyword batch analysis |
140
+ | POST | `/api/match` | Match keyword to candidate meanings |
141
+ | GET | `/api/stats` | Corpus statistics |
142
+
143
+ ### Evaluation
144
+ | Method | Endpoint | Description |
145
+ |--------|----------|-------------|
146
+ | POST | `/api/eval/disambiguation` | Disambiguation accuracy |
147
+ | POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
148
+ | GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
149
+
150
+ ### Word2Vec baseline
151
+ | Method | Endpoint | Description |
152
+ |--------|----------|-------------|
153
+ | POST | `/api/w2v/init` | Train Word2Vec on corpus |
154
+ | POST | `/api/w2v/compare` | Compare two texts |
155
+ | POST | `/api/w2v/query` | Search corpus |
156
+ | POST | `/api/w2v/similar-words` | Find similar words |
157
+
158
+ Full interactive docs available at `/docs` when the server is running.
159
+
160
+ ## Project structure
161
+
162
+ ```
163
+ esfiles/
164
+ ├── pyproject.toml # Dependencies (uv)
165
+ ├── requirements.txt # Fallback for pip
166
+ ├── uv.lock # Lockfile for reproducible installs
167
+ ├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
168
+ ├── training.py # Training pipeline: 3 strategies + evaluation
169
+ ├── evaluation.py # Evaluation: metrics, reports
170
+ ├── word2vec_baseline.py # gensim Word2Vec baseline
171
+ ├── data_loader.py # Epstein Files dataset loader (HuggingFace + ChromaDB)
172
+ ├── server.py # FastAPI REST API
173
+ ├── demo.py # CLI demo: Word2Vec vs Transformer comparison
174
+ ├── Dockerfile # Multi-stage build (Node + Python)
175
+ ├── docker-compose.yml # Local Docker setup
176
+ ├── HOWTO.md # In-depth usage guide
177
+ └── frontend/ # React + TypeScript UI
178
+ ├── package.json
179
+ ├── vite.config.ts
180
+ ├── index.html
181
+ └── src/
182
+ ├── App.tsx # Main app with tab navigation
183
+ ├── api.ts # API client
184
+ ├── types.ts # TypeScript types
185
+ └── components/ # UI components (training, search, evaluation, etc.)
186
+ ```
187
+
188
+ ## Base models
189
+
190
+ | Model | Dimensions | Quality | Speed |
191
+ |-------|-----------|---------|-------|
192
+ | `all-MiniLM-L6-v2` | 384 | Good | Fast |
193
+ | `all-mpnet-base-v2` | 768 | Best | Medium |
194
+
195
+ Start with `all-MiniLM-L6-v2` for iteration, use `all-mpnet-base-v2` for production.
196
+
197
+ ## Further reading
198
+
199
+ See [HOWTO.md](HOWTO.md) for detailed usage examples including Python API usage, training configuration, tuning parameters, and evaluation metrics.
200
+
201
+ ## License
202
+
203
+ Apache 2.0
contextual_similarity.py ADDED
@@ -0,0 +1,850 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contextual Word Similarity Engine
3
+
4
+ Uses transformer-based sentence embeddings (SentenceTransformers) and FAISS
5
+ vector search to find and compare contextual meanings of keywords within
6
+ large documents. Unlike static embeddings (Word2Vec/GloVe), this captures
7
+ how word meaning changes based on surrounding context.
8
+
9
+ Usage:
10
+ engine = ContextualSimilarityEngine()
11
+ engine.add_document("my_doc", text)
12
+ engine.build_index()
13
+ results = engine.analyze_keyword("pizza", top_k=10)
14
+ """
15
+
16
+ import re
17
+ import logging
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import Optional
21
+
22
+ import faiss
23
+ import numpy as np
24
+ from sentence_transformers import SentenceTransformer, util
25
+ from sklearn.cluster import AgglomerativeClustering
26
+ from tqdm import tqdm
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @dataclass
32
+ class Chunk:
33
+ """A passage of text from a document with metadata."""
34
+ text: str
35
+ doc_id: str
36
+ chunk_index: int
37
+ start_char: int
38
+ end_char: int
39
+
40
+ def __repr__(self):
41
+ preview = self.text[:80].replace("\n", " ")
42
+ return f"Chunk(doc={self.doc_id!r}, idx={self.chunk_index}, text={preview!r}...)"
43
+
44
+
45
@dataclass
class SimilarityResult:
    """A single similarity match returned by a search.

    Pairs a corpus chunk with its similarity score and its 1-based
    position in the ranked result list.
    """
    chunk: Chunk    # the matched passage
    score: float    # cosine similarity (inner product of normalized vectors)
    rank: int       # 1-based rank within the result list
51
+
52
+
53
@dataclass
class KeywordContext:
    """A keyword occurrence with its surrounding context."""
    keyword: str    # the keyword as queried
    chunk: Chunk    # chunk whose text contains the keyword
    # (start, end) character spans of each match within chunk.text
    highlight_positions: list[tuple[int, int]] = field(default_factory=list)
59
+
60
+
61
@dataclass
class KeywordAnalysis:
    """Full analysis of a keyword's contextual meanings across a corpus."""
    keyword: str               # analyzed keyword
    total_occurrences: int     # number of chunks containing the keyword
    # One dict per meaning cluster: cluster_id, size, representative_text,
    # contexts, similar_passages
    meaning_clusters: list[dict] = field(default_factory=list)
    # other keyword -> cosine similarity between context centroids
    cross_keyword_similarities: dict[str, float] = field(default_factory=dict)
68
+
69
+
70
class ContextualSimilarityEngine:
    """
    Engine for contextual word similarity analysis using transformer embeddings.

    Loads documents, chunks them into passages, embeds with a SentenceTransformer
    model, indexes with FAISS, and provides methods to:
      - Find all contextual usages of a keyword
      - Cluster keyword usages into distinct meanings
      - Compare keyword contexts across documents
      - Find passages most similar to a query
      - Batch-analyze multiple keywords

    Typical flow: add_document() one or more times, then build_index(),
    then query/analyze. Adding a document invalidates any existing index.
    """
82
+
83
    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        chunk_size: int = 512,
        chunk_overlap: int = 128,
        device: Optional[str] = None,
        batch_size: int = 64,
    ):
        """
        Args:
            model_name: HuggingFace SentenceTransformer model name.
                - "all-MiniLM-L6-v2": fast, good quality (384-dim)
                - "all-mpnet-base-v2": best quality general-purpose (768-dim)
                - "BAAI/bge-large-en-v1.5": high accuracy, larger (1024-dim)
            chunk_size: Max characters per chunk.
            chunk_overlap: Overlap between consecutive chunks (preserves context at boundaries).
            device: PyTorch device ("cpu", "cuda", "mps"). Auto-detected if None.
            batch_size: Batch size for encoding (tune for your GPU memory).
        """
        logger.info(f"Loading model: {model_name}")
        self._model_name = model_name
        # Downloads/loads the transformer; may take a while on first run.
        self.model = SentenceTransformer(model_name, device=device)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        # Dimension of the model's sentence vectors; used to size the FAISS index.
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        # Storage: chunks is the corpus; embeddings/index stay None until
        # build_index() runs (and are reset when documents are added).
        self.chunks: list[Chunk] = []
        self.embeddings: Optional[np.ndarray] = None
        self.index: Optional[faiss.IndexFlatIP] = None
        self._doc_ids: set[str] = set()
115
+
116
+ # ------------------------------------------------------------------ #
117
+ # Document loading & chunking
118
+ # ------------------------------------------------------------------ #
119
+
120
+ def add_document(self, doc_id: str, text: str) -> list[Chunk]:
121
+ """
122
+ Chunk a document and add it to the corpus.
123
+
124
+ Args:
125
+ doc_id: Unique identifier for this document.
126
+ text: Full document text.
127
+
128
+ Returns:
129
+ List of Chunk objects created from this document.
130
+ """
131
+ if doc_id in self._doc_ids:
132
+ raise ValueError(f"Document '{doc_id}' already added. Use a unique doc_id.")
133
+ self._doc_ids.add(doc_id)
134
+
135
+ new_chunks = self._chunk_text(text, doc_id)
136
+ self.chunks.extend(new_chunks)
137
+ logger.info(f"Added document '{doc_id}': {len(new_chunks)} chunks")
138
+
139
+ # Invalidate index so user must rebuild
140
+ self.embeddings = None
141
+ self.index = None
142
+
143
+ return new_chunks
144
+
145
+ def add_document_from_file(self, file_path: str, doc_id: Optional[str] = None) -> list[Chunk]:
146
+ """Load a text file and add it as a document."""
147
+ path = Path(file_path).resolve()
148
+ base_dir = Path(__file__).parent.resolve()
149
+ if not path.is_relative_to(base_dir):
150
+ raise ValueError("File path must be within the project directory.")
151
+ if not path.exists():
152
+ raise FileNotFoundError(f"File not found: {file_path}")
153
+ text = path.read_text(encoding="utf-8")
154
+ return self.add_document(doc_id or path.stem, text)
155
+
156
+ def _chunk_text(self, text: str, doc_id: str) -> list[Chunk]:
157
+ """
158
+ Split text into overlapping chunks, breaking at sentence boundaries
159
+ when possible to preserve semantic coherence.
160
+ """
161
+ # Normalize whitespace
162
+ text = re.sub(r"\n{3,}", "\n\n", text)
163
+
164
+ chunks = []
165
+ start = 0
166
+ chunk_idx = 0
167
+
168
+ while start < len(text):
169
+ end = start + self.chunk_size
170
+
171
+ # If we're not at the end, try to break at a sentence boundary
172
+ if end < len(text):
173
+ # Look for sentence-ending punctuation near the chunk boundary
174
+ search_region = text[max(end - 100, start):end]
175
+ # Find last sentence break in the search region
176
+ for sep in [". ", ".\n", "! ", "!\n", "? ", "?\n", "\n\n"]:
177
+ last_break = search_region.rfind(sep)
178
+ if last_break != -1:
179
+ end = max(end - 100, start) + last_break + len(sep)
180
+ break
181
+
182
+ chunk_text = text[start:end].strip()
183
+ if chunk_text:
184
+ chunks.append(Chunk(
185
+ text=chunk_text,
186
+ doc_id=doc_id,
187
+ chunk_index=chunk_idx,
188
+ start_char=start,
189
+ end_char=end,
190
+ ))
191
+ chunk_idx += 1
192
+
193
+ # Advance with overlap
194
+ start = end - self.chunk_overlap if end < len(text) else end
195
+
196
+ return chunks
197
+
198
+ # ------------------------------------------------------------------ #
199
+ # Embedding & indexing
200
+ # ------------------------------------------------------------------ #
201
+
202
+ def build_index(self, normalize: bool = True, show_progress: bool = True) -> None:
203
+ """
204
+ Embed all chunks and build a FAISS index for fast similarity search.
205
+
206
+ Args:
207
+ normalize: L2-normalize embeddings (enables cosine similarity via inner product).
208
+ show_progress: Show a progress bar during encoding.
209
+ """
210
+ if not self.chunks:
211
+ raise RuntimeError("No documents loaded. Call add_document() first.")
212
+
213
+ logger.info(f"Encoding {len(self.chunks)} chunks...")
214
+ texts = [c.text for c in self.chunks]
215
+
216
+ self.embeddings = self.model.encode(
217
+ texts,
218
+ batch_size=self.batch_size,
219
+ show_progress_bar=show_progress,
220
+ convert_to_numpy=True,
221
+ normalize_embeddings=normalize,
222
+ )
223
+
224
+ # Build FAISS inner-product index (cosine similarity when vectors are normalized)
225
+ self.index = faiss.IndexFlatIP(self.embedding_dim)
226
+ self.index.add(self.embeddings.astype(np.float32))
227
+
228
+ logger.info(f"Index built: {self.index.ntotal} vectors, dim={self.embedding_dim}")
229
+
230
+ # ------------------------------------------------------------------ #
231
+ # Core query methods
232
+ # ------------------------------------------------------------------ #
233
+
234
+ def query(self, text: str, top_k: int = 10) -> list[SimilarityResult]:
235
+ """
236
+ Find the most similar chunks to a query text.
237
+
238
+ Args:
239
+ text: Query string (sentence, phrase, or keyword in context).
240
+ top_k: Number of results to return.
241
+
242
+ Returns:
243
+ List of SimilarityResult sorted by descending similarity score.
244
+ """
245
+ self._ensure_index()
246
+
247
+ query_vec = self.model.encode(
248
+ [text], normalize_embeddings=True, convert_to_numpy=True
249
+ ).astype(np.float32)
250
+
251
+ scores, indices = self.index.search(query_vec, top_k)
252
+
253
+ results = []
254
+ for rank, (score, idx) in enumerate(zip(scores[0], indices[0])):
255
+ if idx == -1:
256
+ continue
257
+ results.append(SimilarityResult(
258
+ chunk=self.chunks[idx],
259
+ score=float(score),
260
+ rank=rank + 1,
261
+ ))
262
+ return results
263
+
264
+ def compare_texts(self, text_a: str, text_b: str) -> float:
265
+ """
266
+ Compute cosine similarity between two texts directly.
267
+
268
+ Returns:
269
+ Similarity score in [-1, 1] (typically [0, 1] for natural language).
270
+ """
271
+ vecs = self.model.encode(
272
+ [text_a, text_b], normalize_embeddings=True, convert_to_tensor=True
273
+ )
274
+ return float(util.pytorch_cos_sim(vecs[0], vecs[1]).item())
275
+
276
+ # ------------------------------------------------------------------ #
277
+ # Keyword analysis
278
+ # ------------------------------------------------------------------ #
279
+
280
+ def find_keyword_contexts(
281
+ self, keyword: str, case_sensitive: bool = False
282
+ ) -> list[KeywordContext]:
283
+ """
284
+ Find all chunks containing a keyword and return them as KeywordContext objects.
285
+
286
+ Args:
287
+ keyword: The word or phrase to search for.
288
+ case_sensitive: Whether matching is case-sensitive.
289
+
290
+ Returns:
291
+ List of KeywordContext with chunk and highlight positions.
292
+ """
293
+ if len(keyword) > 200:
294
+ raise ValueError("Keyword must be 200 characters or fewer.")
295
+ flags = 0 if case_sensitive else re.IGNORECASE
296
+ pattern = re.compile(r"\b" + re.escape(keyword) + r"\b", flags)
297
+
298
+ contexts = []
299
+ for chunk in self.chunks:
300
+ matches = list(pattern.finditer(chunk.text))
301
+ if matches:
302
+ positions = [(m.start(), m.end()) for m in matches]
303
+ contexts.append(KeywordContext(
304
+ keyword=keyword,
305
+ chunk=chunk,
306
+ highlight_positions=positions,
307
+ ))
308
+ return contexts
309
+
310
+ def analyze_keyword(
311
+ self,
312
+ keyword: str,
313
+ top_k: int = 10,
314
+ cluster_threshold: float = 0.35,
315
+ case_sensitive: bool = False,
316
+ ) -> KeywordAnalysis:
317
+ """
318
+ Analyze all contextual usages of a keyword across the corpus.
319
+
320
+ Finds every chunk containing the keyword, embeds them, clusters them
321
+ by semantic similarity (agglomerative clustering), and returns a
322
+ structured analysis with distinct meaning groups.
323
+
324
+ Args:
325
+ keyword: Word or phrase to analyze.
326
+ top_k: Max similar chunks to return per meaning cluster.
327
+ cluster_threshold: Distance threshold for clustering (lower = more clusters).
328
+ 0.35 works well for clearly distinct meanings; raise to 0.5+ to merge similar ones.
329
+ case_sensitive: Whether keyword matching is case-sensitive.
330
+
331
+ Returns:
332
+ KeywordAnalysis with meaning clusters and similarity info.
333
+ """
334
+ self._ensure_index()
335
+ contexts = self.find_keyword_contexts(keyword, case_sensitive)
336
+
337
+ if not contexts:
338
+ return KeywordAnalysis(keyword=keyword, total_occurrences=0)
339
+
340
+ # Get embeddings for keyword-containing chunks
341
+ chunk_indices = []
342
+ for ctx in contexts:
343
+ idx = self.chunks.index(ctx.chunk)
344
+ chunk_indices.append(idx)
345
+
346
+ kw_embeddings = self.embeddings[chunk_indices]
347
+
348
+ # Cluster the keyword contexts by semantic similarity
349
+ clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)
350
+
351
+ # Build meaning clusters
352
+ meaning_clusters = []
353
+ for cluster_id in sorted(set(clusters)):
354
+ member_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
355
+ member_contexts = [contexts[i] for i in member_indices]
356
+ member_embeds = kw_embeddings[member_indices]
357
+
358
+ # Centroid of this cluster
359
+ centroid = member_embeds.mean(axis=0, keepdims=True).astype(np.float32)
360
+ faiss.normalize_L2(centroid)
361
+
362
+ # Find top_k most similar chunks in the full corpus to this meaning
363
+ scores, idx_arr = self.index.search(centroid, top_k)
364
+ similar = []
365
+ for rank, (score, idx) in enumerate(zip(scores[0], idx_arr[0])):
366
+ if idx == -1:
367
+ continue
368
+ similar.append(SimilarityResult(
369
+ chunk=self.chunks[idx],
370
+ score=float(score),
371
+ rank=rank + 1,
372
+ ))
373
+
374
+ meaning_clusters.append({
375
+ "cluster_id": cluster_id,
376
+ "size": len(member_indices),
377
+ "representative_text": member_contexts[0].chunk.text[:200],
378
+ "contexts": member_contexts,
379
+ "similar_passages": similar,
380
+ })
381
+
382
+ return KeywordAnalysis(
383
+ keyword=keyword,
384
+ total_occurrences=len(contexts),
385
+ meaning_clusters=meaning_clusters,
386
+ )
387
+
388
+ def batch_analyze_keywords(
389
+ self,
390
+ keywords: list[str],
391
+ top_k: int = 10,
392
+ cluster_threshold: float = 0.35,
393
+ compare_across: bool = True,
394
+ ) -> dict[str, KeywordAnalysis]:
395
+ """
396
+ Analyze multiple keywords and optionally compute cross-keyword similarities.
397
+
398
+ Args:
399
+ keywords: List of keywords to analyze.
400
+ top_k: Results per cluster.
401
+ cluster_threshold: Clustering distance threshold.
402
+ compare_across: If True, compute pairwise similarity between keyword contexts.
403
+
404
+ Returns:
405
+ Dict mapping keyword -> KeywordAnalysis.
406
+ """
407
+ results = {}
408
+ for kw in tqdm(keywords, desc="Analyzing keywords"):
409
+ results[kw] = self.analyze_keyword(kw, top_k, cluster_threshold)
410
+
411
+ if compare_across and len(keywords) > 1:
412
+ self._compute_cross_keyword_similarities(results)
413
+
414
+ return results
415
+
416
+ def _compute_cross_keyword_similarities(
417
+ self, analyses: dict[str, KeywordAnalysis]
418
+ ) -> None:
419
+ """Compute average cosine similarity between each pair of keywords' contexts."""
420
+ keyword_centroids = {}
421
+ for kw, analysis in analyses.items():
422
+ if not analysis.meaning_clusters:
423
+ continue
424
+ # Collect all context embeddings for this keyword
425
+ all_indices = []
426
+ for cluster in analysis.meaning_clusters:
427
+ for ctx in cluster["contexts"]:
428
+ idx = self.chunks.index(ctx.chunk)
429
+ all_indices.append(idx)
430
+ if all_indices:
431
+ embeds = self.embeddings[all_indices]
432
+ centroid = embeds.mean(axis=0)
433
+ norm = np.linalg.norm(centroid)
434
+ if norm > 0:
435
+ centroid = centroid / norm
436
+ keyword_centroids[kw] = centroid
437
+
438
+ # Pairwise similarities
439
+ kw_list = list(keyword_centroids.keys())
440
+ for i, kw_a in enumerate(kw_list):
441
+ sims = {}
442
+ for j, kw_b in enumerate(kw_list):
443
+ if i != j:
444
+ score = float(np.dot(keyword_centroids[kw_a], keyword_centroids[kw_b]))
445
+ sims[kw_b] = score
446
+ if kw_a in analyses:
447
+ analyses[kw_a].cross_keyword_similarities = sims
448
+
449
+ # ------------------------------------------------------------------ #
450
+ # Contextual keyword matching (the core use case)
451
+ # ------------------------------------------------------------------ #
452
+
453
+ def match_keyword_to_meaning(
454
+ self,
455
+ keyword: str,
456
+ candidate_meanings: list[str],
457
+ ) -> list[dict]:
458
+ """
459
+ Given a keyword and a list of candidate meanings (words/phrases),
460
+ find which meaning each occurrence of the keyword is closest to.
461
+
462
+ This is the core "pizza means school" use case: you provide the keyword
463
+ "pizza" and candidates ["pizza (food)", "school", "homework"], and this
464
+ method tells you which meaning each usage of "pizza" maps to.
465
+
466
+ Args:
467
+ keyword: The keyword to analyze (e.g. "pizza").
468
+ candidate_meanings: List of meaning descriptions (e.g. ["food", "school"]).
469
+
470
+ Returns:
471
+ List of dicts with keys: chunk, best_match, scores (all candidates).
472
+ """
473
+ self._ensure_index()
474
+
475
+ contexts = self.find_keyword_contexts(keyword)
476
+ if not contexts:
477
+ return []
478
+
479
+ # Embed all candidate meanings
480
+ candidate_vecs = self.model.encode(
481
+ candidate_meanings, normalize_embeddings=True, convert_to_tensor=True
482
+ )
483
+
484
+ results = []
485
+ for ctx in contexts:
486
+ # Embed the chunk containing the keyword
487
+ chunk_vec = self.model.encode(
488
+ [ctx.chunk.text], normalize_embeddings=True, convert_to_tensor=True
489
+ )
490
+
491
+ # Score against each candidate
492
+ scores = util.pytorch_cos_sim(chunk_vec, candidate_vecs)[0]
493
+ score_dict = {
494
+ meaning: float(scores[i]) for i, meaning in enumerate(candidate_meanings)
495
+ }
496
+ best = max(score_dict, key=score_dict.get)
497
+
498
+ results.append({
499
+ "chunk": ctx.chunk,
500
+ "best_match": best,
501
+ "best_score": score_dict[best],
502
+ "all_scores": score_dict,
503
+ })
504
+
505
+ return results
506
+
507
+ # ------------------------------------------------------------------ #
508
+ # Context inference (keyword → meaning words)
509
+ # ------------------------------------------------------------------ #
510
+
511
    # Common English stopwords excluded from context-word extraction
    # (infer_keyword_meanings) and from the vocabulary in similar_words.
    # Built from a space-separated literal; frozenset() silently dedupes
    # the few repeated entries in the string.
    _STOPWORDS = frozenset(
        "a an the and or but in on at to for of is it that this was were be been "
        "being have has had do does did will would shall should may might can could "
        "not no nor so if then than too very just about above after again all also "
        "am are as between both by each few from further get got he her here hers "
        "herself him himself his how i its itself me more most my myself no nor "
        "only other our ours ourselves out over own same she some such their theirs "
        "them themselves there these they those through under until up us we what "
        "when where which while who whom why with you your yours yourself yourselves "
        "one two three four five six seven eight nine ten into been being because "
        "during before between against without within along across behind since "
        "upon around among".split()
    )
525
+
526
+ def infer_keyword_meanings(
527
+ self,
528
+ keyword: str,
529
+ context_window: int = 120,
530
+ top_words: int = 8,
531
+ cluster_threshold: float = 0.35,
532
+ max_meanings: int = 10,
533
+ ) -> dict:
534
+ """
535
+ Infer what a keyword likely means based on its surrounding context words.
536
+
537
+ Finds all occurrences, clusters them by semantic similarity, then extracts
538
+ the most distinctive co-occurring words for each meaning cluster.
539
+
540
+ Args:
541
+ keyword: The keyword to analyze.
542
+ context_window: Characters around each keyword occurrence to examine.
543
+ top_words: Number of associated words to return per meaning.
544
+ cluster_threshold: Distance threshold for clustering.
545
+ max_meanings: Maximum number of meaning clusters to return.
546
+
547
+ Returns:
548
+ Dict with keyword, total_occurrences, and meanings list.
549
+ """
550
+ self._ensure_index()
551
+ contexts = self.find_keyword_contexts(keyword)
552
+
553
+ if not contexts:
554
+ return {
555
+ "keyword": keyword,
556
+ "total_occurrences": 0,
557
+ "meanings": [],
558
+ }
559
+
560
+ # Get embeddings and cluster
561
+ chunk_indices = [self.chunks.index(ctx.chunk) for ctx in contexts]
562
+ kw_embeddings = self.embeddings[chunk_indices]
563
+ clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)
564
+
565
+ total = len(contexts)
566
+ kw_lower = keyword.lower()
567
+ word_pattern = re.compile(r"[a-zA-Z]{3,}")
568
+
569
+ # Global word frequencies (across all occurrences) for TF-IDF-like scoring
570
+ global_word_counts: dict[str, int] = {}
571
+ cluster_data: dict[int, list[dict[str, int]]] = {}
572
+
573
+ for i, ctx in enumerate(contexts):
574
+ cluster_id = clusters[i]
575
+ if cluster_id not in cluster_data:
576
+ cluster_data[cluster_id] = []
577
+
578
+ # Extract context window around each keyword occurrence
579
+ local_counts: dict[str, int] = {}
580
+ for start, end in ctx.highlight_positions:
581
+ window_start = max(0, start - context_window)
582
+ window_end = min(len(ctx.chunk.text), end + context_window)
583
+ window_text = ctx.chunk.text[window_start:window_end].lower()
584
+
585
+ for word_match in word_pattern.finditer(window_text):
586
+ w = word_match.group()
587
+ if w == kw_lower or w in self._STOPWORDS or len(w) < 3:
588
+ continue
589
+ local_counts[w] = local_counts.get(w, 0) + 1
590
+ global_word_counts[w] = global_word_counts.get(w, 0) + 1
591
+
592
+ cluster_data[cluster_id].append(local_counts)
593
+
594
+ # Build meanings from clusters
595
+ meanings = []
596
+ for cluster_id in sorted(cluster_data.keys()):
597
+ members = cluster_data[cluster_id]
598
+ count = len(members)
599
+ confidence = round(count / total, 3)
600
+
601
+ # Aggregate word counts for this cluster
602
+ cluster_word_counts: dict[str, int] = {}
603
+ for member_counts in members:
604
+ for w, c in member_counts.items():
605
+ cluster_word_counts[w] = cluster_word_counts.get(w, 0) + c
606
+
607
+ # Score words: cluster frequency weighted by distinctiveness
608
+ # (how much more frequent in this cluster vs globally)
609
+ num_clusters = len(cluster_data)
610
+ word_scores: dict[str, float] = {}
611
+ for w, cluster_count in cluster_word_counts.items():
612
+ global_count = global_word_counts.get(w, 1)
613
+ # TF in cluster * IDF-like distinctiveness
614
+ tf = cluster_count / max(sum(cluster_word_counts.values()), 1)
615
+ distinctiveness = (cluster_count / global_count) if num_clusters > 1 else 1.0
616
+ word_scores[w] = tf * (0.5 + 0.5 * distinctiveness)
617
+
618
+ # Get top words
619
+ sorted_words = sorted(word_scores.items(), key=lambda x: -x[1])[:top_words]
620
+ associated_words = [
621
+ {"word": w, "score": round(s, 4)} for w, s in sorted_words
622
+ ]
623
+
624
+ # Get example context snippets
625
+ example_contexts = []
626
+ member_indices = [j for j, c in enumerate(clusters) if c == cluster_id]
627
+ for j in member_indices[:3]: # max 3 examples
628
+ ctx = contexts[j]
629
+ if ctx.highlight_positions:
630
+ start, end = ctx.highlight_positions[0]
631
+ snippet_start = max(0, start - 80)
632
+ snippet_end = min(len(ctx.chunk.text), end + 80)
633
+ snippet = ctx.chunk.text[snippet_start:snippet_end].strip()
634
+ if snippet_start > 0:
635
+ snippet = "..." + snippet
636
+ if snippet_end < len(ctx.chunk.text):
637
+ snippet = snippet + "..."
638
+ example_contexts.append({
639
+ "doc_id": ctx.chunk.doc_id,
640
+ "snippet": snippet,
641
+ })
642
+
643
+ meanings.append({
644
+ "cluster_id": cluster_id,
645
+ "occurrences": count,
646
+ "confidence": confidence,
647
+ "associated_words": associated_words,
648
+ "example_contexts": example_contexts,
649
+ })
650
+
651
+ # Sort by confidence descending
652
+ meanings.sort(key=lambda m: -m["confidence"])
653
+ meanings = meanings[:max_meanings]
654
+
655
+ return {
656
+ "keyword": keyword,
657
+ "total_occurrences": total,
658
+ "meanings": meanings,
659
+ }
660
+
661
+ # ------------------------------------------------------------------ #
662
+ # Utilities
663
+ # ------------------------------------------------------------------ #
664
+
665
+ def _cluster_embeddings(
666
+ self, embeddings: np.ndarray, threshold: float = 0.35
667
+ ) -> list[int]:
668
+ """Cluster embeddings using agglomerative clustering with cosine distance."""
669
+ if len(embeddings) == 1:
670
+ return [0]
671
+
672
+ clustering = AgglomerativeClustering(
673
+ n_clusters=None,
674
+ distance_threshold=threshold,
675
+ metric="cosine",
676
+ linkage="average",
677
+ )
678
+ labels = clustering.fit_predict(embeddings)
679
+ return labels.tolist()
680
+
681
+ def similar_words(self, word: str, top_k: int = 10) -> list[dict]:
682
+ """
683
+ Find words that appear in similar contexts using transformer embeddings.
684
+
685
+ Extracts unique words from the corpus, encodes them, and finds nearest
686
+ neighbors by cosine similarity. Unlike Word2Vec (one static vector per word),
687
+ this uses the transformer's contextual understanding.
688
+
689
+ Args:
690
+ word: Target word.
691
+ top_k: Number of similar words to return.
692
+
693
+ Returns:
694
+ List of {"word": str, "score": float} sorted by descending similarity.
695
+ """
696
+ self._ensure_index()
697
+
698
+ word_pattern = re.compile(r"[a-zA-Z]{3,}")
699
+ word_lower = word.lower()
700
+
701
+ # Collect unique words from corpus (skip stopwords + the query word itself)
702
+ vocab: set[str] = set()
703
+ for chunk in self.chunks:
704
+ for match in word_pattern.finditer(chunk.text):
705
+ w = match.group().lower()
706
+ if w != word_lower and w not in self._STOPWORDS:
707
+ vocab.add(w)
708
+
709
+ if not vocab:
710
+ return []
711
+
712
+ vocab_list = sorted(vocab)
713
+ logger.info("Similar words: encoding %d vocabulary words for '%s'", len(vocab_list), word)
714
+
715
+ # Encode the query word and all vocab words
716
+ all_texts = [word] + vocab_list
717
+ embeddings = self.model.encode(
718
+ all_texts,
719
+ batch_size=self.batch_size,
720
+ show_progress_bar=False,
721
+ convert_to_numpy=True,
722
+ normalize_embeddings=True,
723
+ )
724
+
725
+ query_vec = embeddings[0:1]
726
+ vocab_vecs = embeddings[1:]
727
+
728
+ # Compute cosine similarities
729
+ scores = (vocab_vecs @ query_vec.T).flatten()
730
+ top_indices = np.argsort(scores)[::-1][:top_k]
731
+
732
+ return [
733
+ {"word": vocab_list[i], "score": round(float(scores[i]), 4)}
734
+ for i in top_indices
735
+ ]
736
+
737
    def _ensure_index(self):
        # Guard used by all query/analysis methods: fail fast with a clear
        # message if build_index() has not been called (or was invalidated
        # by add_document()).
        if self.index is None:
            raise RuntimeError("Index not built. Call build_index() first.")
740
+
741
+ def get_stats(self) -> dict:
742
+ """Return corpus statistics."""
743
+ return {
744
+ "total_chunks": len(self.chunks),
745
+ "total_documents": len(self._doc_ids),
746
+ "document_ids": sorted(self._doc_ids),
747
+ "index_built": self.index is not None,
748
+ "embedding_dim": self.embedding_dim,
749
+ "model_name": self._model_name,
750
+ }
751
+
752
+ # ------------------------------------------------------------------ #
753
+ # Persistence (save / load engine state to disk)
754
+ # ------------------------------------------------------------------ #
755
+
756
+ def save(self, directory: str) -> dict:
757
+ """
758
+ Save the full engine state (chunks, embeddings, FAISS index) to disk.
759
+
760
+ Args:
761
+ directory: Path to save directory (created if needed).
762
+
763
+ Returns:
764
+ Stats dict with what was saved.
765
+ """
766
+ import json, pickle
767
+
768
+ save_dir = Path(directory)
769
+ save_dir.mkdir(parents=True, exist_ok=True)
770
+
771
+ # Save chunks
772
+ with open(save_dir / "chunks.pkl", "wb") as f:
773
+ pickle.dump(self.chunks, f)
774
+
775
+ # Save metadata
776
+ meta = {
777
+ "model_name": self._model_name,
778
+ "chunk_size": self.chunk_size,
779
+ "chunk_overlap": self.chunk_overlap,
780
+ "batch_size": self.batch_size,
781
+ "embedding_dim": self.embedding_dim,
782
+ "doc_ids": sorted(self._doc_ids),
783
+ }
784
+ with open(save_dir / "meta.json", "w") as f:
785
+ json.dump(meta, f, indent=2)
786
+
787
+ # Save embeddings + FAISS index
788
+ saved_index = False
789
+ if self.embeddings is not None:
790
+ np.save(save_dir / "embeddings.npy", self.embeddings)
791
+ if self.index is not None:
792
+ faiss.write_index(self.index, str(save_dir / "index.faiss"))
793
+ saved_index = True
794
+
795
+ logger.info("Engine saved to %s: %d chunks, %d docs, index=%s",
796
+ directory, len(self.chunks), len(self._doc_ids), saved_index)
797
+ return {
798
+ "directory": str(save_dir),
799
+ "chunks": len(self.chunks),
800
+ "documents": len(self._doc_ids),
801
+ "index_saved": saved_index,
802
+ }
803
+
804
+ @classmethod
805
+ def load(cls, directory: str, device: Optional[str] = None) -> "ContextualSimilarityEngine":
806
+ """
807
+ Load a previously saved engine state from disk.
808
+
809
+ Args:
810
+ directory: Path to the saved state directory.
811
+ device: PyTorch device override.
812
+
813
+ Returns:
814
+ A fully restored ContextualSimilarityEngine instance.
815
+ """
816
+ import json, pickle
817
+
818
+ save_dir = Path(directory)
819
+ if not save_dir.is_dir():
820
+ raise FileNotFoundError(f"No saved state at {directory}")
821
+
822
+ # Load metadata
823
+ with open(save_dir / "meta.json") as f:
824
+ meta = json.load(f)
825
+
826
+ # Create engine (loads the model)
827
+ engine = cls(
828
+ model_name=meta["model_name"],
829
+ chunk_size=meta["chunk_size"],
830
+ chunk_overlap=meta["chunk_overlap"],
831
+ device=device,
832
+ batch_size=meta["batch_size"],
833
+ )
834
+
835
+ # Restore chunks
836
+ with open(save_dir / "chunks.pkl", "rb") as f:
837
+ engine.chunks = pickle.load(f)
838
+ engine._doc_ids = set(meta["doc_ids"])
839
+
840
+ # Restore embeddings + index
841
+ emb_path = save_dir / "embeddings.npy"
842
+ idx_path = save_dir / "index.faiss"
843
+ if emb_path.exists():
844
+ engine.embeddings = np.load(emb_path)
845
+ if idx_path.exists():
846
+ engine.index = faiss.read_index(str(idx_path))
847
+
848
+ logger.info("Engine loaded from %s: %d chunks, %d docs, index=%s",
849
+ directory, len(engine.chunks), len(engine._doc_ids), engine.index is not None)
850
+ return engine
data_loader.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Epstein Files Dataset Loader
3
+
4
+ Loads data from two HuggingFace sources:
5
+ 1. teyler/epstein-files-20k — raw OCR text (2.1M rows, filename + text)
6
+ 2. devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB — pre-computed
7
+ all-MiniLM-L6-v2 embeddings in ChromaDB format
8
+
9
+ Both can feed directly into the ContextualSimilarityEngine pipeline.
10
+ """
11
+
12
+ import logging
13
+ import re
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import numpy as np
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # HuggingFace dataset identifiers
23
+ RAW_DATASET = "teyler/epstein-files-20k"
24
+ EMBEDDINGS_DATASET = "devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB"
25
+
26
+
27
def load_raw_dataset(
    max_docs: Optional[int] = None,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
) -> list[dict]:
    """
    Load raw Epstein Files from HuggingFace.

    Args:
        max_docs: Limit number of documents loaded (None = all ~2.1M).
        min_text_length: Skip documents shorter than this.
        source_filter: Filter by filename prefix, e.g. "TEXT-" or "IMAGES-".

    Returns:
        List of {"doc_id": str, "text": str, "filename": str}
    """
    from datasets import load_dataset

    started = time.time()
    logger.info(f"Loading {RAW_DATASET} from HuggingFace...")

    rows = load_dataset(RAW_DATASET, split="train")
    documents: list[dict] = []

    for i, row in enumerate(rows):
        # Stop once enough documents have been KEPT (filters below may skip rows).
        if max_docs and len(documents) >= max_docs:
            break

        text = (row.get("text") or "").strip()
        filename = row.get("filename") or f"doc_{i}"

        if len(text) < min_text_length:
            continue
        if source_filter and not filename.startswith(source_filter):
            continue

        documents.append({
            "doc_id": Path(filename).stem,
            "text": text,
            "filename": filename,
        })

    elapsed = time.time() - started
    logger.info(f"Loaded {len(documents)} documents in {elapsed:.1f}s")
    return documents
70
+
71
+
72
def load_raw_to_engine(
    engine,
    max_docs: Optional[int] = 500,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
    build_index: bool = True,
) -> dict:
    """
    Load raw dataset directly into a ContextualSimilarityEngine.

    Args:
        engine: ContextualSimilarityEngine instance (must be initialized).
        max_docs: Limit documents to load.
        min_text_length: Skip short documents.
        source_filter: Filter by filename prefix.
        build_index: Whether to build FAISS index after loading.

    Returns:
        Stats dict with counts and timing.
    """
    started = time.time()
    documents = load_raw_dataset(max_docs, min_text_length, source_filter)

    chunk_total = 0
    skipped = 0
    for doc in documents:
        try:
            chunk_total += len(engine.add_document(doc["doc_id"], doc["text"]))
        except ValueError as exc:
            # Duplicate doc_id (or other add_document validation failure):
            # log and keep going rather than aborting the whole load.
            logger.warning("Skipped document '%s': %s", doc["doc_id"], exc)
            skipped += 1

    if build_index and chunk_total > 0:
        engine.build_index(show_progress=True)

    return {
        "documents_loaded": len(documents) - skipped,
        "documents_skipped": skipped,
        "total_chunks": chunk_total,
        "index_built": build_index and chunk_total > 0,
        "seconds": round(time.time() - started, 2),
    }
116
+
117
+
118
def load_chromadb_embeddings(
    download_dir: str = "./chroma_epstein",
) -> dict:
    """
    Download and load the pre-computed ChromaDB embeddings.

    Args:
        download_dir: Local directory the HuggingFace snapshot is placed in.

    Returns:
        Dict with "chroma_dir", "collection_name", "total_vectors", "seconds",
        plus live "_collection" and "_client" handles (underscore-prefixed:
        they are runtime objects, not serializable stats).

    Raises:
        FileNotFoundError: If no chroma.sqlite3 is found in the snapshot.
        ValueError: If the persisted ChromaDB contains no collections.
    """
    # Local imports keep chromadb/huggingface_hub optional until this
    # loader is actually used.
    import chromadb
    from huggingface_hub import snapshot_download

    t0 = time.time()
    logger.info(f"Downloading {EMBEDDINGS_DATASET} from HuggingFace...")

    # This repo contains ChromaDB persistence files (not standard datasets),
    # so we use snapshot_download instead of load_dataset.
    local_path = snapshot_download(
        repo_id=EMBEDDINGS_DATASET,
        repo_type="dataset",
        local_dir=download_dir,
    )

    # Find the chroma_db directory: the sqlite file may sit either in a
    # "chroma_db" subfolder or at the snapshot root.
    chroma_dir = None
    for candidate in [
        Path(local_path) / "chroma_db",
        Path(local_path),
    ]:
        if (candidate / "chroma.sqlite3").exists():
            chroma_dir = str(candidate)
            break

    if not chroma_dir:
        raise FileNotFoundError(
            f"ChromaDB files not found in {local_path}. "
            f"Expected chroma.sqlite3 in the download."
        )

    # Open ChromaDB
    client = chromadb.PersistentClient(path=chroma_dir)
    collections = client.list_collections()
    if not collections:
        raise ValueError("No collections found in ChromaDB.")

    # NOTE(review): assumes the snapshot holds a single collection — if
    # several exist, the first returned is used arbitrarily; confirm.
    collection = collections[0]
    count = collection.count()
    logger.info(f"ChromaDB collection '{collection.name}': {count} vectors")

    elapsed = time.time() - t0
    return {
        "chroma_dir": chroma_dir,
        "collection_name": collection.name,
        "total_vectors": count,
        "seconds": round(elapsed, 2),
        "_collection": collection,
        "_client": client,
    }
176
+
177
+
178
def import_chromadb_to_engine(
    engine,
    max_chunks: Optional[int] = None,
    batch_size: int = 1000,
) -> dict:
    """
    Import text chunks from the pre-computed ChromaDB collection into the engine.

    Note on re-encoding: although both the ChromaDB collection and the engine
    use all-MiniLM-L6-v2 (384-dim), this routine only reuses the *texts* —
    chunks are regrouped per source document, added via engine.add_document(),
    and re-encoded when engine.build_index() runs. The fetched vectors are
    counted for the stats but are not injected into FAISS directly.

    Args:
        engine: ContextualSimilarityEngine (must be initialized with all-MiniLM-L6-v2).
        max_chunks: Limit vectors to import (None = all).
        batch_size: How many vectors to fetch from ChromaDB at a time.

    Returns:
        Stats dict with vector/document/chunk counts and timing.
    """
    t0 = time.time()
    chroma_data = load_chromadb_embeddings()
    collection = chroma_data["_collection"]
    total = chroma_data["total_vectors"]

    if max_chunks:
        total = min(total, max_chunks)

    # Fetch in batches
    all_texts = []
    all_embeddings = []
    all_sources = []

    offset = 0
    while offset < total:
        limit = min(batch_size, total - offset)
        results = collection.get(
            limit=limit,
            offset=offset,
            include=["embeddings", "documents", "metadatas"],
        )

        if not results["ids"]:
            break

        for i, doc_id in enumerate(results["ids"]):
            text = results["documents"][i] if results["documents"] is not None else ""
            embedding = results["embeddings"][i] if results["embeddings"] is not None else None
            metadata = results["metadatas"][i] if results["metadatas"] is not None else {}
            # Individual metadata entries can themselves be None even when the
            # metadatas list is present — normalize before calling .get().
            metadata = metadata or {}
            source = metadata.get("source", f"chunk_{offset + i}")

            # Keep only rows that have both text and a vector.
            if text and embedding is not None:
                all_texts.append(text)
                all_embeddings.append(embedding)
                all_sources.append(source)

        offset += len(results["ids"])
        logger.info(f"Fetched {offset}/{total} vectors from ChromaDB")

    # Group texts by source document and add to engine
    doc_chunks = {}
    for text, source in zip(all_texts, all_sources):
        stem = Path(source).stem if source else "unknown"
        doc_chunks.setdefault(stem, []).append(text)

    docs_added = 0
    chunks_added = 0
    for doc_id, texts in doc_chunks.items():
        combined = "\n\n".join(texts)
        try:
            chunks = engine.add_document(doc_id, combined)
            chunks_added += len(chunks)
            docs_added += 1
        except ValueError as e:
            # Engine rejected the document (e.g. unchunkable) — skip and continue.
            logger.warning("Skipped ChromaDB document '%s': %s", doc_id, e)

    if chunks_added > 0:
        engine.build_index(show_progress=True)

    elapsed = time.time() - t0
    return {
        "source": "chromadb_embeddings",
        "chromadb_vectors": len(all_embeddings),
        "documents_created": docs_added,
        "chunks_indexed": chunks_added,
        "index_built": chunks_added > 0,
        "seconds": round(elapsed, 2),
    }
267
+
268
+
269
def get_dataset_info() -> dict:
    """Return metadata about available datasets (no download)."""
    raw_info = {
        "dataset_id": RAW_DATASET,
        "url": f"https://huggingface.co/datasets/{RAW_DATASET}",
        "description": "2.1M OCR text documents from U.S. House Oversight Committee Epstein Files release",
        "columns": ["filename", "text"],
        "size_mb": 106,
    }
    embedding_info = {
        "dataset_id": EMBEDDINGS_DATASET,
        "url": f"https://huggingface.co/datasets/{EMBEDDINGS_DATASET}",
        "description": "Pre-computed all-MiniLM-L6-v2 embeddings in ChromaDB format (~100K+ chunks)",
        "model": "all-MiniLM-L6-v2",
        "vector_dim": 384,
    }
    return {
        "raw_texts": raw_info,
        "embeddings": embedding_info,
    }
demo.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Demo: Word2Vec vs Transformer — side by side comparison.
3
+
4
+ Run: python demo.py
5
+ """
6
+
7
+ import json
8
+ from contextual_similarity import ContextualSimilarityEngine
9
+ from word2vec_baseline import Word2VecEngine
10
+ from evaluation import Evaluator, GroundTruthEntry
11
+
12
+ # ------------------------------------------------------------------ #
13
+ # Sample corpus
14
+ # ------------------------------------------------------------------ #
15
+
16
# Toy corpus for the demo. Two documents use a "code language" in which
# "pizza" means "school" (secret_language, misunderstanding); the other two
# use the same words literally (real_pizza, school_board). Keys are doc IDs.
DOCS = {
    "secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.

The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.

Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
    "real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.

Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
    "school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.

The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
    "misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?

She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.

When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
65
+
66
# (text_a, text_b) pairs scored by BOTH engines in demo section 1: the first
# and last pairs probe the corpus's "pizza == school" code language; the
# middle pairs contrast code-language phrasing with literal food phrasing.
COMPARE_PAIRS = [
    ("I love pizza so much", "I love school so much"),
    ("pizza gives me homework", "school gives me homework"),
    ("pizza gives me homework", "fresh mozzarella on pizza"),
    ("The pizza test is hard", "The school exam is difficult"),
    ("too much pepperoni on my pizza", "math class is too hard"),
]
73
+
74
+
75
def main():
    """Run the five-part Word2Vec vs Transformer comparison demo.

    Builds both engines over the shared DOCS corpus, then prints:
    1) pairwise text similarity, 2) Word2Vec word-level neighbors,
    3) semantic search, 4) keyword meaning matching, 5) meaning clustering,
    ending with a qualitative summary. Output goes to stdout only.
    """
    # ================================================================ #
    # Build both engines on the same corpus
    # ================================================================ #
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)

    # Transformer engine
    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")

    # Word2Vec engine
    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")

    # ================================================================ #
    # 1. Text similarity comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("1. TEXT SIMILARITY — same pairs, both models")
    print("=" * 70)
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)

    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        # "Winner" = larger absolute similarity, i.e. the more confident model.
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")

    # ================================================================ #
    # 2. Word-level similarity (Word2Vec only — transformers don't do this)
    # ================================================================ #
    print("\n" + "=" * 70)
    print("2. WORD-LEVEL SIMILARITY (Word2Vec only)")
    print(" Word2Vec gives ONE vector per word — no context awareness")
    print("=" * 70)

    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")

    print(f"\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")

    # ================================================================ #
    # 3. Semantic search comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("3. SEMANTIC SEARCH — 'a place where children learn and take tests'")
    print("=" * 70)

    query = "a place where children learn and take tests"

    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")

    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")

    # ================================================================ #
    # 4. The core test: does "pizza" mean "school" or "food"?
    # ================================================================ #
    print("\n" + "=" * 70)
    print("4. KEYWORD MEANING MATCHING — 'pizza' -> food or school?")
    print(" Transformer uses full passage context. Word2Vec averages word vectors.")
    print("=" * 70)

    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]

    print("\n Transformer (match_keyword_to_meaning):")
    matches = transformer.match_keyword_to_meaning("pizza", candidates)
    for m in matches:
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")

    print("\n Word2Vec (sentence-level similarity to candidates):")
    # Replicate the same logic with Word2Vec: find each doc's first sentence
    # mentioning "pizza" (with at least 5 words) and score it against the
    # two candidate meanings.
    import re
    for doc_id, text in DOCS.items():
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
        for sent in sents:
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break  # one per doc for brevity

    # ================================================================ #
    # 5. Clustering comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("5. KEYWORD CLUSTERING — can the model separate meanings of 'pizza'?")
    print("=" * 70)

    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        docs = set(ctx.chunk.doc_id for ctx in c["contexts"])
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")

    print(f"\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(f" 'pizza' has exactly ONE embedding regardless of context")

    # ================================================================ #
    # Summary
    # ================================================================ #
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("""
Word2Vec:
+ Fast to train on small corpus
+ Shows which words co-occur (word-level neighbors)
- ONE vector per word — "pizza" is always "pizza"
- Cannot distinguish "pizza = food" from "pizza = school"
- Sentence similarity is just averaged word vectors (lossy)

Transformer (SentenceTransformers):
+ Full sentence/passage context — same word gets different embeddings
+ Can cluster "pizza" into food vs school meanings
+ Pretrained on massive data — understands language out of the box
+ FAISS enables fast search over large corpora
- Larger model (~80MB vs ~1MB for Word2Vec)
- Slower inference (still <100ms per query)
""")


if __name__ == "__main__":
    main()
docker-compose.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-service stack: builds the local Dockerfile and publishes the API on
# host port 8000. HOST/PORT are consumed by the containerized server.
services:
  app:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Persist HuggingFace model cache between restarts
      - hf-cache:/data/huggingface
      # Persist engine state and trained models
      - engine-state:/data/engine_state
      # Bind mount: trained-model artifacts are shared with the host checkout
      - ./trained_model:/data/trained_model
    environment:
      # Bind all interfaces inside the container so the published port works
      - HOST=0.0.0.0
      - PORT=8000

# Named volumes live in Docker-managed storage and survive container recreation
volumes:
  hf-cache:
  engine-state:
evaluation.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation Pipeline for Contextual Similarity Engine
3
+
4
+ Provides metrics and benchmarks to assess the quality of contextual
5
+ keyword matching:
6
+ - Cosine similarity distributions
7
+ - Precision@K and Recall@K for retrieval
8
+ - Normalized Mutual Information (NMI) for clustering quality
9
+ - Mean Reciprocal Rank (MRR) for ranking quality
10
+ - Keyword disambiguation accuracy against ground truth
11
+ - Full evaluation reports with summary statistics
12
+ """
13
+
14
+ import json
15
+ import logging
16
+ import time
17
+ from dataclasses import dataclass, field, asdict
18
+ from pathlib import Path
19
+ from typing import Optional
20
+
21
+ import numpy as np
22
+ from sklearn.metrics import (
23
+ normalized_mutual_info_score,
24
+ adjusted_rand_score,
25
+ precision_score,
26
+ recall_score,
27
+ f1_score,
28
+ confusion_matrix,
29
+ )
30
+
31
+ from contextual_similarity import ContextualSimilarityEngine, KeywordAnalysis
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # ------------------------------------------------------------------ #
37
+ # Data structures
38
+ # ------------------------------------------------------------------ #
39
+
40
@dataclass
class GroundTruthEntry:
    """A single labeled keyword occurrence for evaluation."""
    keyword: str       # The keyword whose meaning is being evaluated
    text: str          # The passage/sentence containing the keyword
    true_meaning: str  # The actual intended meaning label
46
+
47
+
48
@dataclass
class RetrievalMetrics:
    """Metrics for a single retrieval query.

    The *_at_k dicts are keyed by cutoff K (e.g. 1, 3, 5, 10).
    """
    query: str  # The evaluated query text
    precision_at_k: dict[int, float] = field(default_factory=dict)  # k -> P@k
    recall_at_k: dict[int, float] = field(default_factory=dict)     # k -> R@k
    mrr: float = 0.0  # Mean Reciprocal Rank (0.0 when nothing relevant retrieved)
    ndcg_at_k: dict[int, float] = field(default_factory=dict)       # k -> NDCG@k
    avg_similarity: float = 0.0  # Mean similarity score over retrieved results
    top_score: float = 0.0       # Similarity score of the rank-1 result
58
+
59
+
60
@dataclass
class ClusteringMetrics:
    """Metrics for clustering quality against ground truth."""
    keyword: str          # Keyword whose occurrences were clustered
    nmi: float = 0.0      # Normalized Mutual Information
    ari: float = 0.0      # Adjusted Rand Index
    num_predicted_clusters: int = 0  # Clusters the engine produced
    num_true_clusters: int = 0       # Distinct ground-truth meaning labels
    cluster_sizes: list[int] = field(default_factory=list)  # Size of each predicted cluster
69
+
70
+
71
@dataclass
class DisambiguationMetrics:
    """Metrics for keyword meaning disambiguation.

    Per-meaning dicts are keyed by the meaning label string.
    """
    keyword: str
    accuracy: float = 0.0     # Fraction of entries with predicted == true meaning
    weighted_f1: float = 0.0  # Support-weighted F1 across meaning labels
    per_meaning_precision: dict[str, float] = field(default_factory=dict)
    per_meaning_recall: dict[str, float] = field(default_factory=dict)
    per_meaning_f1: dict[str, float] = field(default_factory=dict)
    confusion: Optional[list] = None  # confusion matrix as nested list
    total_samples: int = 0  # Number of ground-truth entries evaluated
82
+
83
+
84
@dataclass
class EvaluationReport:
    """Complete evaluation report.

    Aggregates the per-query / per-keyword metrics produced by Evaluator and
    offers summary(), to_json(), and save() for reporting.
    """
    timestamp: str = ""
    model_name: str = ""
    corpus_stats: dict = field(default_factory=dict)
    retrieval_metrics: list[RetrievalMetrics] = field(default_factory=list)
    clustering_metrics: list[ClusteringMetrics] = field(default_factory=list)
    disambiguation_metrics: list[DisambiguationMetrics] = field(default_factory=list)
    similarity_distribution: dict = field(default_factory=dict)
    timing: dict = field(default_factory=dict)

    def summary(self) -> dict:
        """Return a concise summary of the evaluation.

        Each section (retrieval / clustering / disambiguation) is included
        only when its metrics list is non-empty; values are numpy means
        rounded to 4 decimals.
        """
        summary = {
            "model": self.model_name,
            "corpus": self.corpus_stats,
            "timing": self.timing,
        }

        if self.retrieval_metrics:
            avg_mrr = float(np.mean([m.mrr for m in self.retrieval_metrics]))
            # .get(k, 0) so queries never evaluated at that cutoff count as 0.
            avg_p5 = float(np.mean([m.precision_at_k.get(5, 0) for m in self.retrieval_metrics]))
            avg_p10 = float(np.mean([m.precision_at_k.get(10, 0) for m in self.retrieval_metrics]))
            summary["retrieval"] = {
                "mean_mrr": round(avg_mrr, 4),
                "mean_precision_at_5": round(avg_p5, 4),
                "mean_precision_at_10": round(avg_p10, 4),
                "num_queries": len(self.retrieval_metrics),
            }

        if self.clustering_metrics:
            avg_nmi = float(np.mean([m.nmi for m in self.clustering_metrics]))
            avg_ari = float(np.mean([m.ari for m in self.clustering_metrics]))
            summary["clustering"] = {
                "mean_nmi": round(avg_nmi, 4),
                "mean_ari": round(avg_ari, 4),
                "num_keywords": len(self.clustering_metrics),
            }

        if self.disambiguation_metrics:
            avg_acc = float(np.mean([m.accuracy for m in self.disambiguation_metrics]))
            avg_f1 = float(np.mean([m.weighted_f1 for m in self.disambiguation_metrics]))
            summary["disambiguation"] = {
                "mean_accuracy": round(avg_acc, 4),
                "mean_weighted_f1": round(avg_f1, 4),
                "num_keywords": len(self.disambiguation_metrics),
            }

        if self.similarity_distribution:
            summary["similarity_distribution"] = self.similarity_distribution

        return summary

    def to_json(self, indent: int = 2) -> str:
        """Serialize the full report to JSON.

        default=str lets non-JSON-native values (e.g. numpy scalars) degrade
        to their string form instead of raising.
        """
        return json.dumps(asdict(self), indent=indent, default=str)

    def save(self, path: str) -> None:
        """Save the report to a JSON file at `path`."""
        Path(path).write_text(self.to_json())
        logger.info(f"Evaluation report saved to {path}")
146
+
147
+
148
+ # ------------------------------------------------------------------ #
149
+ # Evaluator
150
+ # ------------------------------------------------------------------ #
151
+
152
+ class Evaluator:
153
+ """
154
+ Evaluation pipeline for the ContextualSimilarityEngine.
155
+
156
+ Usage:
157
+ engine = ContextualSimilarityEngine()
158
+ engine.add_document("doc1", text)
159
+ engine.build_index()
160
+
161
+ evaluator = Evaluator(engine)
162
+
163
+ # Evaluate retrieval quality
164
+ evaluator.evaluate_retrieval(queries_with_relevance)
165
+
166
+ # Evaluate keyword disambiguation
167
+ evaluator.evaluate_disambiguation(ground_truth, candidate_meanings)
168
+
169
+ # Evaluate clustering
170
+ evaluator.evaluate_clustering(ground_truth)
171
+
172
+ # Get full report
173
+ report = evaluator.get_report()
174
+ """
175
+
176
    def __init__(self, engine: ContextualSimilarityEngine):
        """Bind the evaluator to an engine and start a fresh report.

        The engine is expected to already hold documents and a built index
        (corpus stats are snapshotted here via engine.get_stats()).
        """
        self.engine = engine
        # NOTE(review): reads the engine's private _model_name attribute —
        # prefer a public accessor if one exists; confirm on the engine API.
        self._report = EvaluationReport(
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
            model_name=engine._model_name,
            corpus_stats=engine.get_stats(),
        )
+
184
+ # ------------------------------------------------------------------ #
185
+ # Retrieval evaluation
186
+ # ------------------------------------------------------------------ #
187
+
188
    def evaluate_retrieval(
        self,
        queries: list[dict],
        k_values: Optional[list[int]] = None,
    ) -> list[RetrievalMetrics]:
        """
        Evaluate retrieval quality given labeled queries.

        Args:
            queries: List of dicts with keys:
                - "query": str, the query text
                - "relevant_doc_ids": list[str], doc IDs that are relevant
                OR
                - "relevant_texts": list[str], text snippets considered relevant
            k_values: List of K values for P@K, R@K, NDCG@K.
                Defaults to [1, 3, 5, 10].

        Returns:
            List of RetrievalMetrics, one per query. Results and elapsed
            time are also stored on the evaluator's report.
        """
        if k_values is None:
            k_values = [1, 3, 5, 10]

        t0 = time.time()
        all_metrics = []

        for q in queries:
            query_text = q["query"]
            # Retrieve once at the deepest cutoff; shallower K reuse the prefix.
            max_k = max(k_values)
            results = self.engine.query(query_text, top_k=max_k)

            # Determine relevance for each result
            relevant_doc_ids = set(q.get("relevant_doc_ids", []))
            relevant_texts = set(q.get("relevant_texts", []))

            def is_relevant(result):
                # Doc-ID membership first; otherwise case-insensitive
                # substring containment of any labeled snippet.
                if relevant_doc_ids and result.chunk.doc_id in relevant_doc_ids:
                    return True
                if relevant_texts:
                    return any(rt.lower() in result.chunk.text.lower() for rt in relevant_texts)
                return False

            relevance = [is_relevant(r) for r in results]
            scores = [r.score for r in results]

            metrics = RetrievalMetrics(query=query_text)

            # P@K and R@K
            # NOTE(review): total_relevant counts relevant hits *within the
            # retrieved top-max_k only*, so recall is relative to the
            # retrieved set (recall@max_k is 1.0 whenever anything relevant
            # was found), not recall against the corpus — confirm intended.
            total_relevant = sum(relevance)
            for k in k_values:
                top_k_rel = relevance[:k]
                metrics.precision_at_k[k] = sum(top_k_rel) / k if k > 0 else 0
                metrics.recall_at_k[k] = (
                    sum(top_k_rel) / total_relevant if total_relevant > 0 else 0
                )
                metrics.ndcg_at_k[k] = self._compute_ndcg(relevance[:k], k)

            # MRR: reciprocal rank of the first relevant hit (0.0 if none).
            for i, rel in enumerate(relevance):
                if rel:
                    metrics.mrr = 1.0 / (i + 1)
                    break

            metrics.avg_similarity = float(np.mean(scores)) if scores else 0.0
            metrics.top_score = float(scores[0]) if scores else 0.0

            all_metrics.append(metrics)

        elapsed = time.time() - t0
        self._report.retrieval_metrics = all_metrics
        self._report.timing["retrieval_eval_seconds"] = round(elapsed, 3)
        return all_metrics
259
+
260
+ @staticmethod
261
+ def _compute_ndcg(relevance: list[bool], k: int) -> float:
262
+ """Compute NDCG@K for binary relevance."""
263
+ dcg = sum(
264
+ (1 if rel else 0) / np.log2(i + 2)
265
+ for i, rel in enumerate(relevance[:k])
266
+ )
267
+ # Ideal: all relevant items first
268
+ ideal = sorted(relevance[:k], reverse=True)
269
+ idcg = sum(
270
+ (1 if rel else 0) / np.log2(i + 2)
271
+ for i, rel in enumerate(ideal)
272
+ )
273
+ return dcg / idcg if idcg > 0 else 0.0
274
+
275
+ # ------------------------------------------------------------------ #
276
+ # Clustering evaluation
277
+ # ------------------------------------------------------------------ #
278
+
279
    def evaluate_clustering(
        self,
        ground_truth: list[GroundTruthEntry],
        cluster_threshold: float = 0.35,
    ) -> list[ClusteringMetrics]:
        """
        Evaluate clustering quality by comparing engine's auto-clusters
        against ground truth meaning labels.

        NMI and ARI are label-permutation invariant, so the arbitrary
        numbering of clusters on either side does not matter.

        Args:
            ground_truth: Labeled entries with keyword, text, and true_meaning.
            cluster_threshold: Threshold for agglomerative clustering.

        Returns:
            List of ClusteringMetrics, one per keyword. Also stored on the
            evaluator's report together with elapsed time.
        """
        t0 = time.time()

        # Group ground truth by keyword
        by_keyword: dict[str, list[GroundTruthEntry]] = {}
        for entry in ground_truth:
            by_keyword.setdefault(entry.keyword, []).append(entry)

        all_metrics = []
        for keyword, entries in by_keyword.items():
            analysis = self.engine.analyze_keyword(
                keyword, cluster_threshold=cluster_threshold
            )

            # No clusters produced for this keyword: record zeroed metrics.
            if not analysis.meaning_clusters:
                all_metrics.append(ClusteringMetrics(keyword=keyword))
                continue

            # Map ground truth entries to predicted clusters
            true_labels = []
            pred_labels = []
            meaning_to_id = {}

            for entry in entries:
                # Assign numeric ID to each true meaning
                if entry.true_meaning not in meaning_to_id:
                    meaning_to_id[entry.true_meaning] = len(meaning_to_id)
                true_labels.append(meaning_to_id[entry.true_meaning])

                # Find which cluster this entry's text belongs to:
                # nearest-neighbor assignment over every clustered chunk.
                best_cluster = -1
                best_sim = -1
                entry_vec = self.engine.model.encode(
                    [entry.text], normalize_embeddings=True, convert_to_numpy=True
                )
                # NOTE(review): list.index() below is a linear scan per
                # context (quadratic overall on large corpora) — a
                # chunk->index map would be cheaper. The dot product is
                # treated as cosine similarity, which assumes
                # engine.embeddings are L2-normalized; confirm in the engine.
                for cluster in analysis.meaning_clusters:
                    for ctx in cluster["contexts"]:
                        idx = self.engine.chunks.index(ctx.chunk)
                        sim = float(np.dot(entry_vec[0], self.engine.embeddings[idx]))
                        if sim > best_sim:
                            best_sim = sim
                            best_cluster = cluster["cluster_id"]
                pred_labels.append(best_cluster)

            metrics = ClusteringMetrics(
                keyword=keyword,
                nmi=normalized_mutual_info_score(true_labels, pred_labels),
                ari=adjusted_rand_score(true_labels, pred_labels),
                num_predicted_clusters=len(analysis.meaning_clusters),
                num_true_clusters=len(meaning_to_id),
                cluster_sizes=[c["size"] for c in analysis.meaning_clusters],
            )
            all_metrics.append(metrics)

        elapsed = time.time() - t0
        self._report.clustering_metrics = all_metrics
        self._report.timing["clustering_eval_seconds"] = round(elapsed, 3)
        return all_metrics
352
+
353
+ # ------------------------------------------------------------------ #
354
+ # Disambiguation evaluation
355
+ # ------------------------------------------------------------------ #
356
+
357
+ def evaluate_disambiguation(
358
+ self,
359
+ ground_truth: list[GroundTruthEntry],
360
+ candidate_meanings: dict[str, list[str]],
361
+ ) -> list[DisambiguationMetrics]:
362
+ """
363
+ Evaluate keyword meaning disambiguation accuracy.
364
+
365
+ For each ground truth entry, uses match_keyword_to_meaning() and compares
366
+ the predicted best match against the true label.
367
+
368
+ Args:
369
+ ground_truth: Labeled entries with keyword, text, and true_meaning.
370
+ candidate_meanings: Dict mapping keyword -> list of candidate meaning strings.
371
+ Each candidate should be a descriptive phrase, e.g. {"pizza": ["food", "school"]}.
372
+
373
+ Returns:
374
+ List of DisambiguationMetrics, one per keyword.
375
+ """
376
+ t0 = time.time()
377
+
378
+ by_keyword: dict[str, list[GroundTruthEntry]] = {}
379
+ for entry in ground_truth:
380
+ by_keyword.setdefault(entry.keyword, []).append(entry)
381
+
382
+ all_metrics = []
383
+ for keyword, entries in by_keyword.items():
384
+ candidates = candidate_meanings.get(keyword, [])
385
+ if not candidates:
386
+ logger.warning(f"No candidate meanings for '{keyword}', skipping.")
387
+ continue
388
+
389
+ true_labels = []
390
+ pred_labels = []
391
+
392
+ for entry in entries:
393
+ # Encode the entry text and score against each candidate
394
+ entry_vec = self.engine.model.encode(
395
+ [entry.text], normalize_embeddings=True, convert_to_tensor=True
396
+ )
397
+ cand_vecs = self.engine.model.encode(
398
+ candidates, normalize_embeddings=True, convert_to_tensor=True
399
+ )
400
+ from sentence_transformers import util as st_util
401
+ scores = st_util.pytorch_cos_sim(entry_vec, cand_vecs)[0]
402
+ best_idx = int(scores.argmax())
403
+ predicted = candidates[best_idx]
404
+
405
+ true_labels.append(entry.true_meaning)
406
+ pred_labels.append(predicted)
407
+
408
+ # Compute metrics
409
+ unique_labels = sorted(set(true_labels + pred_labels))
410
+ accuracy = sum(t == p for t, p in zip(true_labels, pred_labels)) / len(true_labels)
411
+
412
+ # Per-meaning precision, recall, F1
413
+ per_meaning_p = {}
414
+ per_meaning_r = {}
415
+ per_meaning_f = {}
416
+ for label in unique_labels:
417
+ t_binary = [1 if t == label else 0 for t in true_labels]
418
+ p_binary = [1 if p == label else 0 for p in pred_labels]
419
+ p_val = precision_score(t_binary, p_binary, zero_division=0)
420
+ r_val = recall_score(t_binary, p_binary, zero_division=0)
421
+ f_val = f1_score(t_binary, p_binary, zero_division=0)
422
+ per_meaning_p[label] = round(p_val, 4)
423
+ per_meaning_r[label] = round(r_val, 4)
424
+ per_meaning_f[label] = round(f_val, 4)
425
+
426
+ weighted_f = f1_score(
427
+ true_labels, pred_labels, average="weighted", zero_division=0
428
+ )
429
+
430
+ cm = confusion_matrix(true_labels, pred_labels, labels=unique_labels)
431
+
432
+ metrics = DisambiguationMetrics(
433
+ keyword=keyword,
434
+ accuracy=round(accuracy, 4),
435
+ weighted_f1=round(weighted_f, 4),
436
+ per_meaning_precision=per_meaning_p,
437
+ per_meaning_recall=per_meaning_r,
438
+ per_meaning_f1=per_meaning_f,
439
+ confusion=cm.tolist(),
440
+ total_samples=len(entries),
441
+ )
442
+ all_metrics.append(metrics)
443
+
444
+ elapsed = time.time() - t0
445
+ self._report.disambiguation_metrics = all_metrics
446
+ self._report.timing["disambiguation_eval_seconds"] = round(elapsed, 3)
447
+ return all_metrics
448
+
449
+ # ------------------------------------------------------------------ #
450
+ # Similarity distribution analysis
451
+ # ------------------------------------------------------------------ #
452
+
453
+ def analyze_similarity_distribution(
454
+ self, sample_size: int = 1000, seed: int = 42
455
+ ) -> dict:
456
+ """
457
+ Analyze the distribution of pairwise similarities in the corpus.
458
+ Useful for calibrating thresholds and understanding embedding space.
459
+
460
+ Returns:
461
+ Dict with mean, std, percentiles, and histogram data.
462
+ """
463
+ self.engine._ensure_index()
464
+ n = len(self.engine.chunks)
465
+ rng = np.random.RandomState(seed)
466
+
467
+ # Sample random pairs
468
+ actual_sample = min(sample_size, n * (n - 1) // 2)
469
+ pairs_i = rng.randint(0, n, size=actual_sample)
470
+ pairs_j = rng.randint(0, n, size=actual_sample)
471
+ # Avoid self-pairs
472
+ mask = pairs_i != pairs_j
473
+ pairs_i, pairs_j = pairs_i[mask], pairs_j[mask]
474
+
475
+ sims = np.sum(
476
+ self.engine.embeddings[pairs_i] * self.engine.embeddings[pairs_j], axis=1
477
+ )
478
+
479
+ percentiles = {
480
+ str(p): round(float(np.percentile(sims, p)), 4)
481
+ for p in [5, 10, 25, 50, 75, 90, 95]
482
+ }
483
+
484
+ # Histogram
485
+ hist, bin_edges = np.histogram(sims, bins=20, range=(-1, 1))
486
+ histogram = [
487
+ {"bin_start": round(float(bin_edges[i]), 3), "bin_end": round(float(bin_edges[i + 1]), 3), "count": int(hist[i])}
488
+ for i in range(len(hist))
489
+ ]
490
+
491
+ dist_info = {
492
+ "sample_size": int(len(sims)),
493
+ "mean": round(float(np.mean(sims)), 4),
494
+ "std": round(float(np.std(sims)), 4),
495
+ "min": round(float(np.min(sims)), 4),
496
+ "max": round(float(np.max(sims)), 4),
497
+ "percentiles": percentiles,
498
+ "histogram": histogram,
499
+ }
500
+
501
+ self._report.similarity_distribution = dist_info
502
+ return dist_info
503
+
504
+ # ------------------------------------------------------------------ #
505
+ # Full evaluation
506
+ # ------------------------------------------------------------------ #
507
+
508
+ def run_full_evaluation(
509
+ self,
510
+ ground_truth: Optional[list[GroundTruthEntry]] = None,
511
+ candidate_meanings: Optional[dict[str, list[str]]] = None,
512
+ retrieval_queries: Optional[list[dict]] = None,
513
+ cluster_threshold: float = 0.35,
514
+ ) -> EvaluationReport:
515
+ """
516
+ Run the complete evaluation pipeline.
517
+
518
+ Args:
519
+ ground_truth: Labeled data for clustering and disambiguation eval.
520
+ candidate_meanings: Keyword -> candidate meanings for disambiguation.
521
+ retrieval_queries: Labeled queries for retrieval eval.
522
+ cluster_threshold: Clustering distance threshold.
523
+
524
+ Returns:
525
+ Full EvaluationReport.
526
+ """
527
+ logger.info("Running full evaluation pipeline...")
528
+ t0 = time.time()
529
+
530
+ # Always compute similarity distribution
531
+ self.analyze_similarity_distribution()
532
+
533
+ if retrieval_queries:
534
+ self.evaluate_retrieval(retrieval_queries)
535
+
536
+ if ground_truth:
537
+ self.evaluate_clustering(ground_truth, cluster_threshold)
538
+ if candidate_meanings:
539
+ self.evaluate_disambiguation(ground_truth, candidate_meanings)
540
+
541
+ self._report.timing["total_eval_seconds"] = round(time.time() - t0, 3)
542
+ logger.info("Evaluation complete.")
543
+ return self._report
544
+
545
+ def get_report(self) -> EvaluationReport:
546
+ """Return the current evaluation report."""
547
+ return self._report
frontend/.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
frontend/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # React + Vite
2
+
3
+ This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4
+
5
+ Currently, two official plugins are available:
6
+
7
+ - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh
8
+ - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
9
+
10
+ ## React Compiler
11
+
12
+ The React Compiler is not enabled on this template because of its impact on dev & build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
13
+
14
+ ## Expanding the ESLint configuration
15
+
16
+ If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
frontend/eslint.config.js ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// ESLint flat config (ESLint 9+) for the frontend dev tooling.
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import { defineConfig, globalIgnores } from 'eslint/config'

export default defineConfig([
  // Never lint build output.
  globalIgnores(['dist']),
  {
    // NOTE(review): only .js/.jsx files are matched here, but the app sources
    // are .ts/.tsx (src/*.tsx), so `eslint .` will skip them unless
    // typescript-eslint coverage is added — confirm whether this is intentional.
    files: ['**/*.{js,jsx}'],
    extends: [
      js.configs.recommended,
      reactHooks.configs.flat.recommended,
      reactRefresh.configs.vite,
    ],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
      parserOptions: {
        ecmaVersion: 'latest',
        ecmaFeatures: { jsx: true },
        sourceType: 'module',
      },
    },
    rules: {
      // Allow intentionally-unused uppercase/underscore-prefixed bindings.
      'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
    },
  },
])
frontend/index.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Contextual Similarity Engine</title>
7
+ </head>
8
+ <body>
9
+ <div id="root"></div>
10
+ <script type="module" src="/src/main.tsx"></script>
11
+ </body>
12
+ </html>
frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
frontend/package.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "contextual-similarity-ui",
3
+ "private": true,
4
+ "version": "1.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "axios": "^1.13.6",
14
+ "react": "^19.2.4",
15
+ "react-dom": "^19.2.4",
16
+ "recharts": "^3.8.0"
17
+ },
18
+ "devDependencies": {
19
+ "@eslint/js": "^9.39.4",
20
+ "@types/react": "^19.2.14",
21
+ "@types/react-dom": "^19.2.3",
22
+ "@vitejs/plugin-react": "^5.1.4",
23
+ "eslint": "^9.39.4",
24
+ "eslint-plugin-react-hooks": "^7.0.1",
25
+ "eslint-plugin-react-refresh": "^0.5.2",
26
+ "globals": "^17.4.0",
27
+ "typescript": "~5.9.3",
28
+ "vite": "^7.3.1"
29
+ }
30
+ }
frontend/public/vite.svg ADDED
frontend/src/App.tsx ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, Fragment } from "react";
2
+ import type { CorpusStats } from "./types";
3
+ import { api, checkConnection } from "./api";
4
+ import TrainingPanel from "./components/TrainingPanel";
5
+ import EngineSetup from "./components/EngineSetup";
6
+ import SemanticSearch from "./components/SemanticSearch";
7
+ import TextCompare from "./components/TextCompare";
8
+ import KeywordAnalysis from "./components/KeywordAnalysis";
9
+ import KeywordMatcher from "./components/KeywordMatcher";
10
+ import BatchAnalysis from "./components/BatchAnalysis";
11
+ import SimilarWords from "./components/SimilarWords";
12
+ import ContextAnalysis from "./components/ContextAnalysis";
13
+ import EvaluationDashboard from "./components/EvaluationDashboard";
14
+ import Word2VecPanel from "./components/Word2VecPanel";
15
+ import DatasetPanel from "./components/DatasetPanel";
16
+ import "./styles.css";
17
+
18
// Top-level navigation groups shown in the stepper.
type NavGroup = "data" | "training" | "analysis" | "evaluation";
// Sub-tabs within the "training" group.
type TrainingTab = "model" | "w2v";
// Sub-tabs within the "analysis" group.
type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";

// Stepper entries; `needsIndex` marks steps that are disabled until the
// corpus index has been built.
const STEPS: { id: NavGroup; label: string; needsIndex?: boolean }[] = [
  { id: "data", label: "Data & Setup" },
  { id: "training", label: "Training" },
  { id: "analysis", label: "Analysis", needsIndex: true },
  { id: "evaluation", label: "Evaluation", needsIndex: true },
];

const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
  { id: "model", label: "Fine-tune Model" },
  { id: "w2v", label: "Word2Vec Baseline" },
];

const ANALYSIS_TABS: { id: AnalysisTab; label: string }[] = [
  { id: "context", label: "Context" },
  { id: "words", label: "Similar Words" },
  { id: "search", label: "Search" },
  { id: "compare", label: "Compare" },
  { id: "keyword", label: "Keywords" },
  { id: "match", label: "Matcher" },
  { id: "batch", label: "Batch" },
];
43
+
44
/**
 * Application shell: header with corpus stats, a stepper used as the main
 * navigation, per-group sub-tabs, and the panel for the selected view.
 * Steps marked `needsIndex` stay disabled until stats report a built index.
 */
export default function App() {
  const [group, setGroup] = useState<NavGroup>("data");
  const [trainingTab, setTrainingTab] = useState<TrainingTab>("model");
  const [analysisTab, setAnalysisTab] = useState<AnalysisTab>("context");
  const [stats, setStats] = useState<CorpusStats | null>(null);
  const [showManualSetup, setShowManualSetup] = useState(false);
  const [serverError, setServerError] = useState<string | null>(null);
  // Gate for the index-dependent steps (analysis/evaluation).
  const ready = stats !== null && stats.index_built;

  useEffect(() => {
    checkConnection().then((err) => {
      setServerError(err);
      // If server is up, try to fetch stats (engine may have been auto-restored)
      if (!err) {
        api.getStats().then(setStats).catch(() => {});
      }
    });
    // Re-probe connectivity every 15s. NOTE(review): stats are only fetched
    // on mount, not on recovery — confirm whether that is intentional.
    const interval = setInterval(() => {
      checkConnection().then(setServerError);
    }, 15000);
    return () => clearInterval(interval);
  }, []);

  // Ignore clicks on steps that require an index before one exists.
  function handleStepClick(id: NavGroup, needsIndex?: boolean) {
    if (needsIndex && !ready) return;
    setGroup(id);
  }

  return (
    <div className="app">
      <header className="app-header">
        <h1>Contextual Similarity Engine</h1>
        {stats && (
          <div className="header-stats">
            <span className="badge">{stats.model_name}</span>
            <span className="badge">{stats.total_documents} docs</span>
            <span className="badge">{stats.total_chunks} chunks</span>
            <span className={`badge ${stats.index_built ? "badge-ok" : "badge-warn"}`}>
              {stats.index_built ? "Index ready" : "Index not built"}
            </span>
          </div>
        )}
      </header>

      {serverError && (
        <div className="server-error-banner">
          <strong>Server unavailable:</strong> {serverError}
        </div>
      )}

      {/* Progress Stepper (serves as main navigation) */}
      <nav className="stepper">
        {STEPS.map((step, i) => {
          const disabled = step.needsIndex && !ready;
          const active = group === step.id;
          // Only the "data" step renders a checkmark once the index exists.
          const done = step.id === "data" && ready;
          return (
            <Fragment key={step.id}>
              {i > 0 && (
                <div className={`stepper-line ${!disabled ? "stepper-line-active" : ""}`} />
              )}
              <div className="stepper-item">
                <button
                  className={`stepper-circle ${active ? "stepper-active" : ""} ${done && !active ? "stepper-done" : ""}`}
                  onClick={() => handleStepClick(step.id, step.needsIndex)}
                  disabled={disabled}
                >
                  {done && !active ? "\u2713" : i + 1}
                </button>
                <span className={`stepper-label ${active ? "stepper-label-active" : ""}`}>
                  {step.label}
                </span>
              </div>
            </Fragment>
          );
        })}
      </nav>

      {/* Sub-tabs for groups with multiple views */}
      {group === "training" && (
        <nav className="subtabs">
          {TRAINING_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${trainingTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setTrainingTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {group === "analysis" && (
        <nav className="subtabs">
          {ANALYSIS_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${analysisTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setAnalysisTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {/* Content */}
      <main className="content">
        {group === "data" && (
          <>
            <DatasetPanel onStatsUpdate={setStats} />
            <button
              className="collapsible-toggle"
              onClick={() => setShowManualSetup(!showManualSetup)}
            >
              <span className="collapsible-arrow">{showManualSetup ? "\u25be" : "\u25b8"}</span>
              Or add documents manually
            </button>
            {showManualSetup && <EngineSetup onStatsUpdate={setStats} />}
          </>
        )}

        {group === "training" && trainingTab === "model" && <TrainingPanel />}
        {group === "training" && trainingTab === "w2v" && <Word2VecPanel />}

        {group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
        {group === "analysis" && analysisTab === "words" && <SimilarWords />}
        {group === "analysis" && analysisTab === "search" && <SemanticSearch />}
        {group === "analysis" && analysisTab === "compare" && <TextCompare />}
        {group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
        {group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
        {group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}

        {group === "evaluation" && <EvaluationDashboard />}
      </main>
    </div>
  );
}
frontend/src/api.ts ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import axios from "axios";
2
+ import type {
3
+ InitRequest, InitResponse, DocumentRequest, AddDocResponse, BuildIndexResponse,
4
+ QueryRequest, QueryResponse, CompareRequest, CompareResponse,
5
+ KeywordAnalysisRequest, KeywordAnalysisResponse,
6
+ KeywordMatchRequest, MatchResponse, BatchAnalysisRequest,
7
+ CorpusStats, SimilarityDistribution, DisambiguationMetric, RetrievalMetric,
8
+ TrainResponse, TrainEvalResponse,
9
+ W2VInitResponse, W2VQueryResult, W2VSimilarWord,
10
+ DatasetInfo, DatasetLoadRequest, DatasetLoadResponse, DatasetPreviewResponse,
11
+ ContextAnalysisResponse,
12
+ } from "./types";
13
+
14
+ const client = axios.create({ baseURL: "/api" });
15
+ const long = { timeout: 600000 };
16
+
17
+ /** Extract a human-readable error message from an Axios error. */
18
+ export function getErrorMessage(err: unknown): string {
19
+ if (axios.isAxiosError(err)) {
20
+ if (err.code === "ECONNABORTED") return "Request timed out. The server may be busy.";
21
+ if (!err.response) return "Cannot connect to server. Is it running? (uv run python server.py)";
22
+ const detail = err.response.data?.detail;
23
+ if (typeof detail === "string") return detail;
24
+ if (typeof err.response.data === "string") return err.response.data;
25
+ return `Server error (${err.response.status})`;
26
+ }
27
+ if (err instanceof Error) return err.message;
28
+ return "An unexpected error occurred.";
29
+ }
30
+
31
+ /** Check if the backend is reachable. Returns null on success or an error message. */
32
+ export async function checkConnection(): Promise<string | null> {
33
+ try {
34
+ await client.get("/stats", { timeout: 5000 });
35
+ return null;
36
+ } catch (err) {
37
+ if (axios.isAxiosError(err) && err.response?.status === 400) {
38
+ // 400 = "Engine not initialized" — server is up, just no engine yet
39
+ return null;
40
+ }
41
+ return getErrorMessage(err);
42
+ }
43
+ }
44
+
45
+ /** Shared shape for all training requests (matches server TrainRequest). */
46
+ interface TrainRequestData {
47
+ corpus_texts: string[];
48
+ base_model: string;
49
+ output_path: string;
50
+ epochs: number;
51
+ batch_size: number;
52
+ }
53
+
54
+ export const api = {
55
+ // ---- Training ----
56
+ trainUnsupervised: (data: TrainRequestData) =>
57
+ client.post<TrainResponse>("/train/unsupervised", data, long).then(r => r.data),
58
+
59
+ trainContrastive: (data: TrainRequestData) =>
60
+ client.post<TrainResponse>("/train/contrastive", data, long).then(r => r.data),
61
+
62
+ trainKeywords: (data: TrainRequestData & { keyword_meanings: Record<string, string> }) =>
63
+ client.post<TrainResponse>("/train/keywords", data, long).then(r => r.data),
64
+
65
+ trainEvaluate: (data: { test_pairs: { text_a: string; text_b: string; expected: number }[]; trained_model_path: string; base_model: string; corpus_texts: string[] }) =>
66
+ client.post<TrainEvalResponse>("/train/evaluate", data).then(r => r.data),
67
+
68
+ // ---- Engine ----
69
+ init: (data: InitRequest) =>
70
+ client.post<InitResponse>("/init", data).then(r => r.data),
71
+
72
+ addDocument: (data: DocumentRequest) =>
73
+ client.post<AddDocResponse>("/documents", data).then(r => r.data),
74
+
75
+ buildIndex: () =>
76
+ client.post<BuildIndexResponse>("/index/build").then(r => r.data),
77
+
78
+ query: (data: QueryRequest) =>
79
+ client.post<QueryResponse>("/query", data).then(r => r.data),
80
+
81
+ compare: (data: CompareRequest) =>
82
+ client.post<CompareResponse>("/compare", data).then(r => r.data),
83
+
84
+ analyzeKeyword: (data: KeywordAnalysisRequest) =>
85
+ client.post<KeywordAnalysisResponse>("/analyze/keyword", data).then(r => r.data),
86
+
87
+ batchAnalyze: (data: BatchAnalysisRequest) =>
88
+ client.post<Record<string, KeywordAnalysisResponse>>("/analyze/batch", data).then(r => r.data),
89
+
90
+ matchKeyword: (data: KeywordMatchRequest) =>
91
+ client.post<MatchResponse>("/match", data).then(r => r.data),
92
+
93
+ analyzeContext: (data: { keyword: string; cluster_threshold?: number; top_words?: number }) =>
94
+ client.post<ContextAnalysisResponse>("/analyze/context", data).then(r => r.data),
95
+
96
+ similarWords: (data: { word: string; top_k: number }) =>
97
+ client.post<{ word: string; similar: { word: string; score: number }[] }>("/analyze/similar-words", data).then(r => r.data),
98
+
99
+ getStats: () =>
100
+ client.get<CorpusStats>("/stats").then(r => r.data),
101
+
102
+ getCorpusTexts: (maxDocs: number = 500) =>
103
+ client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),
104
+
105
+ // ---- Engine persistence ----
106
+ saveEngine: () =>
107
+ client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),
108
+
109
+ hasSavedState: () =>
110
+ client.get<{ exists: boolean }>("/engine/has-saved-state").then(r => r.data),
111
+
112
+ // ---- Evaluation ----
113
+ getSimilarityDistribution: () =>
114
+ client.get<SimilarityDistribution>("/eval/similarity-distribution").then(r => r.data),
115
+
116
+ evalDisambiguation: (data: { ground_truth: { keyword: string; text: string; true_meaning: string }[]; candidate_meanings: Record<string, string[]> }) =>
117
+ client.post<{ metrics: DisambiguationMetric[] }>("/eval/disambiguation", data).then(r => r.data),
118
+
119
+ evalRetrieval: (data: { queries: { query: string; relevant_doc_ids?: string[]; relevant_texts?: string[] }[]; k_values: number[] }) =>
120
+ client.post<{ metrics: RetrievalMetric[] }>("/eval/retrieval", data).then(r => r.data),
121
+
122
+ // ---- Word2Vec ----
123
+ w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
124
+ client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),
125
+
126
+ w2vCompare: (data: { text_a: string; text_b: string }) =>
127
+ client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),
128
+
129
+ w2vQuery: (data: { text: string; top_k: number }) =>
130
+ client.post<{ query: string; results: W2VQueryResult[] }>("/w2v/query", data).then(r => r.data),
131
+
132
+ w2vSimilarWords: (data: { word: string; top_k: number }) =>
133
+ client.post<{ word: string; similar: W2VSimilarWord[] }>("/w2v/similar-words", data).then(r => r.data),
134
+
135
+ // ---- Dataset (HuggingFace) ----
136
+ datasetInfo: () =>
137
+ client.get<DatasetInfo>("/dataset/info").then(r => r.data),
138
+
139
+ datasetLoad: (data: DatasetLoadRequest) =>
140
+ client.post<DatasetLoadResponse>("/dataset/load", data, long).then(r => r.data),
141
+
142
+ datasetPreview: (maxDocs: number = 10, sourceFilter?: string) =>
143
+ client.post<DatasetPreviewResponse>(`/dataset/preview?max_docs=${maxDocs}${sourceFilter ? `&source_filter=${sourceFilter}` : ""}`).then(r => r.data),
144
+ };
frontend/src/assets/react.svg ADDED
frontend/src/components/BatchAnalysis.tsx ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { KeywordAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
/**
 * Analyze several keywords in one request: renders a cross-keyword similarity
 * matrix (when the server returned cross similarities) and, per keyword, its
 * meaning clusters with a representative snippet.
 */
export default function BatchAnalysis() {
  const [keywordsText, setKeywordsText] = useState("");
  const [topK, setTopK] = useState(5);
  const [threshold, setThreshold] = useState(0.4);
  const { data: results, loading, error, run } = useApiCall<Record<string, KeywordAnalysisResponse>>();

  // Split the textarea into trimmed, non-empty keywords and submit.
  async function handleAnalyze() {
    const keywords = keywordsText.split("\n").map((s) => s.trim()).filter(Boolean);
    if (keywords.length === 0) return;
    await run(() => api.batchAnalyze({ keywords, top_k: topK, cluster_threshold: threshold, compare_across: true }));
  }

  return (
    <div>
      <div className="panel">
        <h2>Batch Keyword Analysis</h2>
        <p className="panel-desc">
          Analyze multiple keywords at once and compare their semantic relationships.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Keywords (one per line)</label>
            <textarea
              value={keywordsText}
              onChange={(e) => setKeywordsText(e.target.value)}
              placeholder={`pizza\nschool\nhomework`}
              rows={4}
            />
          </div>
          <div className="flex-col gap-1">
            <div className="form-group form-group-sm">
              <label>Top K</label>
              {/* NOTE(review): `+e.target.value` yields NaN while the field is
                  cleared — confirm the server tolerates that. */}
              <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
            </div>
            <div className="form-group form-group-md">
              <label>Cluster Threshold</label>
              <input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
            </div>
          </div>
        </div>
        <button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keywordsText.trim()}>
          {loading ? "Analyzing..." : "Analyze All"}
        </button>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && (
        <>
          {/* Matrix only renders when at least one keyword has cross similarities. */}
          {Object.values(results).some((a) => Object.keys(a.cross_keyword_similarities).length > 0) && (
            <div className="panel">
              <h3>Cross-Keyword Similarity</h3>
              <table className="data-table">
                <thead>
                  <tr>
                    <th>Keyword</th>
                    {Object.keys(results).map((kw) => (
                      <th key={kw}>{kw}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {Object.entries(results).map(([kw, analysis]) => (
                    <tr key={kw}>
                      <td style={{ fontWeight: 600 }}>{kw}</td>
                      {Object.keys(results).map((other) => (
                        <td key={other}>
                          {kw === other ? (
                            <span className="text-dim">-</span>
                          ) : (
                            <ScoreBar score={analysis.cross_keyword_similarities[other] ?? 0} />
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>
          )}

          {/* One panel per keyword with its clusters. */}
          {Object.entries(results).map(([kw, analysis]) => (
            <div key={kw} className="panel">
              <h3>
                "{kw}" &mdash; {analysis.total_occurrences} occurrence(s),{" "}
                {analysis.meaning_clusters.length} cluster(s)
              </h3>
              {analysis.meaning_clusters.map((cluster) => (
                <div key={cluster.cluster_id} className="result-card mt-1">
                  <div className="result-header">
                    <strong>Cluster {cluster.cluster_id}</strong>
                    <span className="tag">{cluster.size} occurrence(s)</span>
                  </div>
                  <div className="result-text">{cluster.representative_text.slice(0, 200)}...</div>
                </div>
              ))}
            </div>
          ))}
        </>
      )}
    </div>
  );
}
frontend/src/components/ContextAnalysis.tsx ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { ContextAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import StatusMessage from "./StatusMessage";
6
+
7
/**
 * Single-keyword context analysis: submits the keyword to /analyze/context
 * and renders each discovered meaning with its associated-word bars and
 * example context snippets.
 */
export default function ContextAnalysis() {
  const [keyword, setKeyword] = useState("");
  const { data: result, loading, error, run } = useApiCall<ContextAnalysisResponse>();

  async function handleAnalyze() {
    if (!keyword.trim()) return;
    await run(() => api.analyzeContext({ keyword: keyword.trim() }));
  }

  return (
    <div>
      <div className="panel">
        <h2>Context Analysis</h2>
        <p className="panel-desc">
          Enter a keyword to discover what it likely means based on how it's used in the corpus.
          The engine clusters all occurrences and extracts the most associated words for each meaning.
        </p>
        <div className="flex-row" style={{ alignItems: "flex-end" }}>
          <div className="form-group form-group-lg">
            <label>Keyword</label>
            <input
              value={keyword}
              onChange={(e) => setKeyword(e.target.value)}
              onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
              placeholder="e.g. Epstein, flight, island"
            />
          </div>
          <button
            className="btn btn-primary"
            onClick={handleAnalyze}
            disabled={loading || !keyword.trim()}
            style={{ height: 38 }}
          >
            {loading ? "Analyzing..." : "Analyze"}
          </button>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {/* Keyword not present in the corpus at all. */}
      {result && result.total_occurrences === 0 && (
        <StatusMessage type="err" message={`No occurrences of "${result.keyword}" found in the corpus.`} />
      )}

      {result && result.meanings.length > 0 && (
        <div className="panel">
          <h2>
            "{result.keyword}" — {result.total_occurrences} occurrences, {result.meanings.length} meaning{result.meanings.length > 1 ? "s" : ""}
          </h2>

          <div className="flex-col gap-3">
            {result.meanings.map((meaning, idx) => (
              <div key={meaning.cluster_id} className="result-card">
                <div className="result-header">
                  <span style={{ fontWeight: 600, fontSize: "0.9rem" }}>
                    Meaning {idx + 1}
                  </span>
                  <div className="flex-row">
                    <span className="badge">
                      {meaning.occurrences} occurrence{meaning.occurrences > 1 ? "s" : ""}
                    </span>
                    {/* Confidence badge: green above 0.5, accent blue otherwise. */}
                    <span
                      className="badge"
                      style={{
                        background: `rgba(${meaning.confidence > 0.5 ? "74, 222, 128" : "108, 140, 255"}, 0.15)`,
                        color: meaning.confidence > 0.5 ? "var(--ok)" : "var(--accent)",
                      }}
                    >
                      {(meaning.confidence * 100).toFixed(1)}%
                    </span>
                  </div>
                </div>

                {/* Associated words bar chart — widths scaled to the top word's score. */}
                <div className="mt-2">
                  {meaning.associated_words.map((aw) => {
                    const maxScore = meaning.associated_words[0]?.score || 1;
                    const pct = Math.round((aw.score / maxScore) * 100);
                    return (
                      <div key={aw.word} className="context-bar-row">
                        <span className="context-bar-label">{aw.word}</span>
                        <div className="context-bar-track">
                          <div className="context-bar-fill" style={{ width: `${pct}%` }} />
                        </div>
                        <span className="context-bar-value">{(aw.score * 100).toFixed(0)}</span>
                      </div>
                    );
                  })}
                </div>

                {/* Example snippets */}
                {meaning.example_contexts.length > 0 && (
                  <div className="mt-2">
                    <div className="section-label">Example contexts</div>
                    {meaning.example_contexts.map((ex, i) => (
                      <div key={i} className="context-snippet">
                        <span className="context-snippet-source">{ex.doc_id}</span>
                        {ex.snippet}
                      </div>
                    ))}
                  </div>
                )}
              </div>
            ))}
          </div>
        </div>
      )}
    </div>
  );
}
frontend/src/components/DatasetPanel.tsx ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types";
4
+ import StatusMessage from "./StatusMessage";
5
+ import MetricCard from "./MetricCard";
6
+ import Toggle from "./Toggle";
7
+ import Select from "./Select";
8
+ import Switch from "./Switch";
9
+ import LogViewer from "./LogViewer";
10
+
11
+ interface Props {
12
+ onStatsUpdate?: (stats: any) => void;
13
+ }
14
+
15
+ export default function DatasetPanel({ onStatsUpdate }: Props) {
16
+ const [info, setInfo] = useState<DatasetInfo | null>(null);
17
+ const [error, setError] = useState("");
18
+
19
+ // Load config
20
+ const [source, setSource] = useState<"raw" | "embeddings">("raw");
21
+ const [maxDocs, setMaxDocs] = useState(500);
22
+ const [minTextLen, setMinTextLen] = useState(100);
23
+ const [sourceFilter, setSourceFilter] = useState("");
24
+ const [loadAll, setLoadAll] = useState(true);
25
+ const [buildIndex, setBuildIndex] = useState(true);
26
+ const [loading, setLoading] = useState(false);
27
+ const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
28
+ const [showAdvanced, setShowAdvanced] = useState(false);
29
+
30
+ // Preview
31
+ const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
32
+ const [previewLoading, setPreviewLoading] = useState(false);
33
+
34
+ useEffect(() => {
35
+ api.datasetInfo().then(setInfo).catch((err) => {
36
+ setError(getErrorMessage(err));
37
+ });
38
+ }, []);
39
+
40
+ async function handlePreview() {
41
+ setPreviewLoading(true); setError("");
42
+ try {
43
+ const res = await api.datasetPreview(10, sourceFilter || undefined);
44
+ setPreviewDocs(res.documents);
45
+ } catch (err) {
46
+ setError(getErrorMessage(err));
47
+ } finally {
48
+ setPreviewLoading(false);
49
+ }
50
+ }
51
+
52
+ async function handleLoad() {
53
+ setLoading(true); setError(""); setLoadResult(null);
54
+ try {
55
+ const res = await api.datasetLoad({
56
+ source,
57
+ max_docs: loadAll ? 100000 : maxDocs,
58
+ min_text_length: loadAll ? 0 : minTextLen,
59
+ source_filter: sourceFilter || undefined,
60
+ build_index: buildIndex,
61
+ });
62
+ setLoadResult(res);
63
+ if (onStatsUpdate) {
64
+ try { const s = await api.getStats(); onStatsUpdate(s); } catch (e) {
65
+ console.warn("Failed to refresh stats after load:", e);
66
+ }
67
+ }
68
+ } catch (err) {
69
+ setError(getErrorMessage(err));
70
+ } finally {
71
+ setLoading(false);
72
+ }
73
+ }
74
+
75
+ return (
76
+ <div>
77
+ {/* Info */}
78
+ <div className="panel">
79
+ <h2>Epstein Files Dataset</h2>
80
+ <p className="panel-desc">
81
+ Load documents from the publicly released U.S. House Oversight Committee Epstein Files
82
+ via HuggingFace. Two sources available:
83
+ </p>
84
+
85
+ {info && (
86
+ <div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
87
+ <div className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
88
+ style={{ flex: "1 1 280px", cursor: "pointer" }}
89
+ onClick={() => setSource("raw")}>
90
+ <div className="result-header">
91
+ <strong>Raw Text Documents</strong>
92
+ <span className="badge">{info.raw_texts.size_mb} MB</span>
93
+ </div>
94
+ <div className="result-text">{info.raw_texts.description}</div>
95
+ <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
96
+ Columns: {info.raw_texts.columns?.join(", ")}
97
+ </div>
98
+ </div>
99
+ <div className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
100
+ style={{ flex: "1 1 280px", cursor: "pointer" }}
101
+ onClick={() => setSource("embeddings")}>
102
+ <div className="result-header">
103
+ <strong>Pre-computed Embeddings</strong>
104
+ <span className="badge">{info.embeddings.vector_dim}d</span>
105
+ </div>
106
+ <div className="result-text">{info.embeddings.description}</div>
107
+ <div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
108
+ Model: {info.embeddings.model}
109
+ </div>
110
+ </div>
111
+ </div>
112
+ )}
113
+
114
+ <Toggle
115
+ options={[
116
+ { value: "raw", label: "Raw Texts" },
117
+ { value: "embeddings", label: "ChromaDB Embeddings" },
118
+ ]}
119
+ value={source}
120
+ onChange={(v) => setSource(v as "raw" | "embeddings")}
121
+ />
122
+ </div>
123
+
124
+ {/* Load actions + advanced config */}
125
+ <div className="panel">
126
+ <h2>Load Dataset</h2>
127
+ <div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
128
+ <button className="btn btn-primary" onClick={handleLoad}
129
+ disabled={loading}>
130
+ {loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
131
+ </button>
132
+ {source === "raw" && (
133
+ <button className="btn btn-secondary" onClick={handlePreview}
134
+ disabled={previewLoading}>
135
+ {previewLoading ? "Loading..." : "Preview Documents"}
136
+ </button>
137
+ )}
138
+ </div>
139
+
140
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
141
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
142
+ </button>
143
+
144
+ {showAdvanced && (
145
+ <div className="advanced-section">
146
+ <div className="form-row">
147
+ <div className="form-group" style={{ maxWidth: 200 }}>
148
+ <label>Load All Documents</label>
149
+ <Switch checked={loadAll} onChange={setLoadAll}
150
+ label={loadAll ? "Yes (no limits)" : "No (use filters below)"} />
151
+ </div>
152
+ {!loadAll && (
153
+ <>
154
+ <div className="form-group" style={{ maxWidth: 140 }}>
155
+ <label>Max Documents</label>
156
+ <input type="number" value={maxDocs} onChange={e => setMaxDocs(+e.target.value)}
157
+ min={10} max={100000} />
158
+ </div>
159
+ {source === "raw" && (
160
+ <div className="form-group" style={{ maxWidth: 140 }}>
161
+ <label>Min Text Length</label>
162
+ <input type="number" value={minTextLen} onChange={e => setMinTextLen(+e.target.value)}
163
+ min={0} max={10000} />
164
+ </div>
165
+ )}
166
+ </>
167
+ )}
168
+ {source === "raw" && (
169
+ <div className="form-group" style={{ maxWidth: 220 }}>
170
+ <label>Source Filter</label>
171
+ <Select
172
+ options={[
173
+ { value: "", label: "All sources" },
174
+ { value: "TEXT-", label: "TEXT- (native text files)" },
175
+ { value: "IMAGES-", label: "IMAGES- (OCR from images)" },
176
+ ]}
177
+ value={sourceFilter}
178
+ onChange={setSourceFilter}
179
+ />
180
+ </div>
181
+ )}
182
+ <div className="form-group" style={{ maxWidth: 200 }}>
183
+ <label>Build Index</label>
184
+ <Switch checked={buildIndex} onChange={setBuildIndex}
185
+ label={buildIndex ? "Yes (ready to search)" : "No (load only)"} />
186
+ </div>
187
+ </div>
188
+ </div>
189
+ )}
190
+
191
+ {loading && (
192
+ <StatusMessage type="loading"
193
+ message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..." />
194
+ )}
195
+
196
+ <LogViewer active={loading} />
197
+ </div>
198
+
199
+ {error && <StatusMessage type="err" message={error} />}
200
+
201
+ {/* Load result */}
202
+ {loadResult && (
203
+ <div className="panel">
204
+ <h2>Dataset Loaded</h2>
205
+ <div className="metric-grid mb-2">
206
+ {loadResult.documents_loaded !== undefined && (
207
+ <MetricCard value={loadResult.documents_loaded} label="Documents" />
208
+ )}
209
+ {loadResult.documents_created !== undefined && (
210
+ <MetricCard value={loadResult.documents_created} label="Documents" />
211
+ )}
212
+ {(loadResult.total_chunks || loadResult.chunks_indexed) && (
213
+ <MetricCard value={loadResult.total_chunks || loadResult.chunks_indexed || 0} label="Chunks" />
214
+ )}
215
+ {loadResult.chromadb_vectors !== undefined && (
216
+ <MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
217
+ )}
218
+ <MetricCard value={`${loadResult.seconds}s`} label="Time" />
219
+ </div>
220
+ <StatusMessage type="ok"
221
+ message={loadResult.index_built
222
+ ? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
223
+ : "Dataset loaded. Build the index from the Setup tab to enable search."} />
224
+ </div>
225
+ )}
226
+
227
+ {/* Preview */}
228
+ {previewDocs.length > 0 && (
229
+ <div className="panel">
230
+ <h2>Document Preview ({previewDocs.length} docs)</h2>
231
+ {previewDocs.map((doc, i) => (
232
+ <div key={i} className="result-card" style={{ marginBottom: 8 }}>
233
+ <div className="result-header">
234
+ <span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
235
+ <span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
236
+ </div>
237
+ <div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
238
+ {doc.text_preview}
239
+ </div>
240
+ </div>
241
+ ))}
242
+ </div>
243
+ )}
244
+ </div>
245
+ );
246
+ }
frontend/src/components/EngineSetup.tsx ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { CorpusStats } from "../types";
4
+ import StatusMessage from "./StatusMessage";
5
+ import Select from "./Select";
6
+
7
+ interface Props {
8
+ onStatsUpdate: (stats: CorpusStats) => void;
9
+ }
10
+
11
+ const MODELS = [
12
+ { value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast, 384-dim)" },
13
+ { value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality, 768-dim)" },
14
+ { value: "BAAI/bge-large-en-v1.5", label: "BAAI/bge-large-en-v1.5 (high accuracy, 1024-dim)" },
15
+ ];
16
+
17
+ export default function EngineSetup({ onStatsUpdate }: Props) {
18
+ const [model, setModel] = useState("all-MiniLM-L6-v2");
19
+ const [chunkSize, setChunkSize] = useState(512);
20
+ const [chunkOverlap, setChunkOverlap] = useState(128);
21
+ const [batchSize, setBatchSize] = useState(64);
22
+
23
+ const [docId, setDocId] = useState("");
24
+ const [docText, setDocText] = useState("");
25
+
26
+ const [showAdvanced, setShowAdvanced] = useState(false);
27
+ const [status, setStatus] = useState<{ type: "ok" | "err" | "loading"; msg: string } | null>(null);
28
+ const [initialized, setInitialized] = useState(false);
29
+ const [docsAdded, setDocsAdded] = useState<string[]>([]);
30
+
31
+ async function handleInit() {
32
+ setStatus({ type: "loading", msg: "Loading model..." });
33
+ try {
34
+ const res = await api.init({
35
+ model_name: model,
36
+ chunk_size: chunkSize,
37
+ chunk_overlap: chunkOverlap,
38
+ batch_size: batchSize,
39
+ });
40
+ setInitialized(true);
41
+ setDocsAdded([]);
42
+ setStatus({ type: "ok", msg: `Model "${res.model}" loaded in ${res.load_time_seconds}s` });
43
+ } catch (e: unknown) {
44
+ setStatus({ type: "err", msg: getErrorMessage(e) });
45
+ }
46
+ }
47
+
48
+ async function handleAddDoc() {
49
+ if (!docId.trim() || !docText.trim()) return;
50
+ setStatus({ type: "loading", msg: `Adding document "${docId}"...` });
51
+ try {
52
+ const res = await api.addDocument({ doc_id: docId, text: docText });
53
+ setDocsAdded((prev) => [...prev, res.doc_id]);
54
+ setStatus({ type: "ok", msg: `Added "${res.doc_id}": ${res.num_chunks} chunks` });
55
+ setDocId("");
56
+ setDocText("");
57
+ } catch (e: unknown) {
58
+ setStatus({ type: "err", msg: getErrorMessage(e) });
59
+ }
60
+ }
61
+
62
+ async function handleBuildIndex() {
63
+ setStatus({ type: "loading", msg: "Building FAISS index..." });
64
+ try {
65
+ const res = await api.buildIndex();
66
+ setStatus({
67
+ type: "ok",
68
+ msg: `Index built: ${res.total_chunks} vectors (dim=${res.embedding_dim}) in ${res.build_time_seconds}s`,
69
+ });
70
+ const stats = await api.getStats();
71
+ onStatsUpdate(stats);
72
+ } catch (e: unknown) {
73
+ setStatus({ type: "err", msg: getErrorMessage(e) });
74
+ }
75
+ }
76
+
77
+ return (
78
+ <div>
79
+ {/* Step 1: Initialize engine */}
80
+ <div className="panel">
81
+ <h2>1. Initialize Engine</h2>
82
+ <div className="form-row">
83
+ <div className="form-group">
84
+ <label>Model</label>
85
+ <Select options={MODELS} value={model} onChange={setModel} />
86
+ </div>
87
+ </div>
88
+
89
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
90
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
91
+ </button>
92
+
93
+ {showAdvanced && (
94
+ <div className="advanced-section">
95
+ <div className="form-row">
96
+ <div className="form-group form-group-md">
97
+ <label>Chunk Size</label>
98
+ <input type="number" value={chunkSize} onChange={(e) => setChunkSize(+e.target.value)} />
99
+ </div>
100
+ <div className="form-group form-group-md">
101
+ <label>Overlap</label>
102
+ <input type="number" value={chunkOverlap} onChange={(e) => setChunkOverlap(+e.target.value)} />
103
+ </div>
104
+ <div className="form-group form-group-md">
105
+ <label>Batch Size</label>
106
+ <input type="number" value={batchSize} onChange={(e) => setBatchSize(+e.target.value)} />
107
+ </div>
108
+ </div>
109
+ </div>
110
+ )}
111
+
112
+ <button className="btn btn-primary" onClick={handleInit} style={{ marginTop: 8 }}>
113
+ Initialize
114
+ </button>
115
+ </div>
116
+
117
+ {/* Step 2: Add documents */}
118
+ <div className="panel">
119
+ <h2>2. Add Documents</h2>
120
+ {docsAdded.length > 0 && (
121
+ <div style={{ marginBottom: 12 }}>
122
+ {docsAdded.map((id) => (
123
+ <span key={id} className="tag">{id}</span>
124
+ ))}
125
+ </div>
126
+ )}
127
+ <div className="form-row">
128
+ <div className="form-group form-group-lg">
129
+ <label>Document ID</label>
130
+ <input
131
+ value={docId}
132
+ onChange={(e) => setDocId(e.target.value)}
133
+ placeholder="e.g. chapter_1"
134
+ disabled={!initialized}
135
+ />
136
+ </div>
137
+ </div>
138
+ <div className="form-group mb-2">
139
+ <label>Document Text</label>
140
+ <textarea
141
+ value={docText}
142
+ onChange={(e) => setDocText(e.target.value)}
143
+ placeholder="Paste your document text here..."
144
+ rows={8}
145
+ disabled={!initialized}
146
+ />
147
+ </div>
148
+ <button className="btn btn-primary" onClick={handleAddDoc} disabled={!initialized || !docId || !docText}>
149
+ Add Document
150
+ </button>
151
+ </div>
152
+
153
+ {/* Step 3: Build index */}
154
+ <div className="panel">
155
+ <h2>3. Build Index</h2>
156
+ <p className="panel-desc">
157
+ Embeds all chunks and builds a FAISS index for fast similarity search.
158
+ This must be done after adding all documents.
159
+ </p>
160
+ <button
161
+ className="btn btn-primary"
162
+ onClick={handleBuildIndex}
163
+ disabled={!initialized || docsAdded.length === 0}
164
+ >
165
+ Build Index
166
+ </button>
167
+ </div>
168
+
169
+ {status && <StatusMessage type={status.type} message={status.msg} />}
170
+ </div>
171
+ );
172
+ }
frontend/src/components/EvaluationDashboard.tsx ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import {
3
+ BarChart,
4
+ Bar,
5
+ XAxis,
6
+ YAxis,
7
+ CartesianGrid,
8
+ Tooltip,
9
+ ResponsiveContainer,
10
+ Cell,
11
+ } from "recharts";
12
+ import { api, getErrorMessage } from "../api";
13
+ import type { EvalSection, SimilarityDistribution, DisambiguationMetric, RetrievalMetric } from "../types";
14
+ import StatusMessage from "./StatusMessage";
15
+ import MetricCard from "./MetricCard";
16
+
17
+ // ---- Structured form types ----
18
+
19
// One labeled ground-truth example for the disambiguation form: a sentence
// containing the keyword, plus the short label of its intended meaning
// (label values come from getMeaningLabels).
interface GtRow {
  text: string;
  meaning: string;
}

// One retrieval test case: a search query and a text snippet the engine is
// expected to retrieve for it.
interface RetrievalRow {
  query: string;
  relevantText: string;
}

// ---- Example data ----
// Demo scenario: "pizza" is used both literally (food) and as a code word for
// school, so the engine must separate the two senses from context.

const EXAMPLE_KEYWORD = "pizza";
const EXAMPLE_MEANINGS = [
  "school, education, and academic activities like homework and tests",
  "food, Italian cuisine, restaurant, cooking, and eating",
];
const EXAMPLE_GT: GtRow[] = [
  { text: "I love pizza so much, I go there every day", meaning: "school" },
  { text: "pizza gives me homework", meaning: "school" },
  { text: "she made the best margherita pizza in the city", meaning: "food" },
  { text: "pizza dough recipe used tipo 00 flour", meaning: "food" },
  { text: "The pizza test is going to be so hard", meaning: "school" },
  { text: "This pizza is amazing, the crust is perfectly crispy", meaning: "food" },
];

const EXAMPLE_RETRIEVAL: RetrievalRow[] = [
  { query: "kids using secret code words for school", relevantText: "secret language" },
  { query: "Italian restaurant with wood-fired oven", relevantText: "pizza" },
];
49
+
50
+ // ---- Meaning label helpers ----
51
+
52
+ function getMeaningLabels(meanings: string[]): string[] {
53
+ return meanings.map((m) => {
54
+ const first = m.split(",")[0].trim();
55
+ return first.length > 20 ? first.slice(0, 20) : first;
56
+ });
57
+ }
58
+
59
+ // ---- Tab config ----
60
+
61
// Sub-tab registry for the evaluation dashboard. `id` must be a member of the
// EvalSection union (drives the `section` state); `desc` is the helper text
// shown under the tab bar.
const EVAL_TABS: { id: EvalSection; label: string; desc: string }[] = [
  {
    id: "distribution",
    label: "Distribution",
    desc: "Analyze pairwise similarity distribution across your corpus. One-click — no setup needed.",
  },
  {
    id: "disambiguation",
    label: "Disambiguation",
    desc: "Test whether the engine can tell apart different meanings of the same word. Provide example sentences and label each with the intended meaning.",
  },
  {
    id: "retrieval",
    label: "Retrieval",
    desc: "Measure how well the engine finds relevant documents for a given query. Provide search queries and what text they should match.",
  },
];
78
+
79
+ export default function EvaluationDashboard() {
80
+ const [section, setSection] = useState<EvalSection>("distribution");
81
+ const [distrib, setDistrib] = useState<SimilarityDistribution | null>(null);
82
+ const [disambig, setDisambig] = useState<DisambiguationMetric[] | null>(null);
83
+ const [retrieval, setRetrieval] = useState<RetrievalMetric[] | null>(null);
84
+ const [loading, setLoading] = useState("");
85
+ const [error, setError] = useState("");
86
+
87
+ // Disambiguation structured form
88
+ const [keyword, setKeyword] = useState("");
89
+ const [meanings, setMeanings] = useState<string[]>(["", ""]);
90
+ const [gtRows, setGtRows] = useState<GtRow[]>([{ text: "", meaning: "" }]);
91
+
92
+ // Retrieval structured form
93
+ const [retRows, setRetRows] = useState<RetrievalRow[]>([{ query: "", relevantText: "" }]);
94
+
95
+ // ---- Distribution ----
96
+
97
+ async function fetchDistribution() {
98
+ setLoading("distrib");
99
+ setError("");
100
+ try {
101
+ setDistrib(await api.getSimilarityDistribution());
102
+ } catch (err) {
103
+ setError(getErrorMessage(err));
104
+ } finally {
105
+ setLoading("");
106
+ }
107
+ }
108
+
109
+ // ---- Disambiguation ----
110
+
111
+ function loadDisambiguationExample() {
112
+ setKeyword(EXAMPLE_KEYWORD);
113
+ setMeanings([...EXAMPLE_MEANINGS]);
114
+ setGtRows(EXAMPLE_GT.map((r) => ({ ...r })));
115
+ }
116
+
117
+ function updateMeaning(i: number, val: string) {
118
+ const next = [...meanings];
119
+ next[i] = val;
120
+ setMeanings(next);
121
+ }
122
+
123
+ function addMeaning() {
124
+ setMeanings([...meanings, ""]);
125
+ }
126
+
127
+ function removeMeaning(i: number) {
128
+ if (meanings.length <= 2) return;
129
+ setMeanings(meanings.filter((_, idx) => idx !== i));
130
+ // Update GT rows that referenced removed meaning
131
+ const labels = getMeaningLabels(meanings);
132
+ const removed = labels[i];
133
+ setGtRows(gtRows.map((r) => (r.meaning === removed ? { ...r, meaning: "" } : r)));
134
+ }
135
+
136
+ function updateGtRow(i: number, field: keyof GtRow, val: string) {
137
+ const next = [...gtRows];
138
+ next[i] = { ...next[i], [field]: val };
139
+ setGtRows(next);
140
+ }
141
+
142
+ function addGtRow() {
143
+ setGtRows([...gtRows, { text: "", meaning: "" }]);
144
+ }
145
+
146
+ function removeGtRow(i: number) {
147
+ if (gtRows.length <= 1) return;
148
+ setGtRows(gtRows.filter((_, idx) => idx !== i));
149
+ }
150
+
151
+ async function runDisambiguation() {
152
+ if (!keyword.trim()) { setError("Enter a keyword."); return; }
153
+ const validMeanings = meanings.filter((m) => m.trim());
154
+ if (validMeanings.length < 2) { setError("Add at least 2 meanings."); return; }
155
+ const validGt = gtRows.filter((r) => r.text.trim() && r.meaning);
156
+ if (validGt.length < 2) { setError("Add at least 2 labeled examples."); return; }
157
+
158
+ setLoading("disambig");
159
+ setError("");
160
+ try {
161
+ const labels = getMeaningLabels(meanings);
162
+ const ground_truth = validGt.map((r) => ({
163
+ keyword: keyword.trim(),
164
+ text: r.text,
165
+ true_meaning: r.meaning,
166
+ }));
167
+ const candidate_meanings: Record<string, string[]> = {
168
+ [keyword.trim()]: validMeanings,
169
+ };
170
+ // Map GT meaning labels back to full candidate strings for the API
171
+ // The API compares against candidates, so true_meaning should match a candidate label
172
+ // We use short labels for the dropdown, but the API uses them as-is for matching
173
+ const res = await api.evalDisambiguation({ ground_truth, candidate_meanings });
174
+ setDisambig(res.metrics);
175
+ } catch (e) {
176
+ setError(getErrorMessage(e));
177
+ } finally {
178
+ setLoading("");
179
+ }
180
+ }
181
+
182
+ // ---- Retrieval ----
183
+
184
+ function loadRetrievalExample() {
185
+ setRetRows(EXAMPLE_RETRIEVAL.map((r) => ({ ...r })));
186
+ }
187
+
188
+ function updateRetRow(i: number, field: keyof RetrievalRow, val: string) {
189
+ const next = [...retRows];
190
+ next[i] = { ...next[i], [field]: val };
191
+ setRetRows(next);
192
+ }
193
+
194
+ function addRetRow() {
195
+ setRetRows([...retRows, { query: "", relevantText: "" }]);
196
+ }
197
+
198
+ function removeRetRow(i: number) {
199
+ if (retRows.length <= 1) return;
200
+ setRetRows(retRows.filter((_, idx) => idx !== i));
201
+ }
202
+
203
+ async function runRetrieval() {
204
+ const valid = retRows.filter((r) => r.query.trim());
205
+ if (valid.length === 0) { setError("Add at least one query."); return; }
206
+
207
+ setLoading("retrieval");
208
+ setError("");
209
+ try {
210
+ const queries = valid.map((r) => ({
211
+ query: r.query,
212
+ relevant_texts: r.relevantText.trim() ? [r.relevantText.trim()] : [],
213
+ }));
214
+ const res = await api.evalRetrieval({ queries, k_values: [1, 3, 5, 10] });
215
+ setRetrieval(res.metrics);
216
+ } catch (e) {
217
+ setError(getErrorMessage(e));
218
+ } finally {
219
+ setLoading("");
220
+ }
221
+ }
222
+
223
+ // ---- Meaning labels for dropdown ----
224
+ const meaningLabels = getMeaningLabels(meanings);
225
+
226
+ return (
227
+ <div>
228
+ <nav className="subtabs mb-2">
229
+ {EVAL_TABS.map((t) => (
230
+ <button
231
+ key={t.id}
232
+ className={`subtab ${section === t.id ? "subtab-active" : ""}`}
233
+ onClick={() => { setSection(t.id); setError(""); }}
234
+ >
235
+ {t.label}
236
+ </button>
237
+ ))}
238
+ </nav>
239
+
240
+ <p className="panel-desc">{EVAL_TABS.find((t) => t.id === section)?.desc}</p>
241
+
242
+ {error && <StatusMessage type="err" message={error} />}
243
+
244
+ {/* ---- Similarity Distribution ---- */}
245
+ {section === "distribution" && (
246
+ <div className="panel">
247
+ <button className="btn btn-primary" onClick={fetchDistribution} disabled={loading === "distrib"}>
248
+ {loading === "distrib" ? "Computing..." : "Compute Distribution"}
249
+ </button>
250
+
251
+ {distrib && (
252
+ <div className="mt-2">
253
+ <div className="metric-grid mb-3">
254
+ {[
255
+ { label: "Mean", value: distrib.mean },
256
+ { label: "Std Dev", value: distrib.std },
257
+ { label: "Min", value: distrib.min },
258
+ { label: "Max", value: distrib.max },
259
+ ].map((m) => (
260
+ <MetricCard key={m.label} value={m.value.toFixed(3)} label={m.label} />
261
+ ))}
262
+ </div>
263
+
264
+ <h3>Histogram</h3>
265
+ <ResponsiveContainer width="100%" height={250}>
266
+ <BarChart data={distrib.histogram}>
267
+ <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" />
268
+ <XAxis
269
+ dataKey="bin_start"
270
+ tick={{ fill: "var(--text-dim)", fontSize: 11 }}
271
+ tickFormatter={(v: number) => v.toFixed(1)}
272
+ />
273
+ <YAxis tick={{ fill: "var(--text-dim)", fontSize: 11 }} />
274
+ <Tooltip
275
+ contentStyle={{
276
+ background: "var(--surface)",
277
+ border: "1px solid var(--border)",
278
+ borderRadius: 6,
279
+ color: "var(--text)",
280
+ }}
281
+ formatter={(value: unknown) => [Number(value), "Count"]}
282
+ labelFormatter={(v: unknown) => `Similarity: ${Number(v).toFixed(2)}`}
283
+ />
284
+ <Bar dataKey="count" radius={[4, 4, 0, 0]}>
285
+ {distrib.histogram.map((entry, i) => (
286
+ <Cell
287
+ key={i}
288
+ fill={entry.bin_start >= 0.5 ? "var(--ok)" : entry.bin_start >= 0 ? "var(--accent)" : "var(--err)"}
289
+ />
290
+ ))}
291
+ </Bar>
292
+ </BarChart>
293
+ </ResponsiveContainer>
294
+
295
+ <h3 className="mt-2">Percentiles</h3>
296
+ <table className="data-table">
297
+ <thead>
298
+ <tr>
299
+ {Object.keys(distrib.percentiles).map((p) => (
300
+ <th key={p}>P{p}</th>
301
+ ))}
302
+ </tr>
303
+ </thead>
304
+ <tbody>
305
+ <tr>
306
+ {Object.values(distrib.percentiles).map((v, i) => (
307
+ <td key={i}>{v.toFixed(4)}</td>
308
+ ))}
309
+ </tr>
310
+ </tbody>
311
+ </table>
312
+ </div>
313
+ )}
314
+ </div>
315
+ )}
316
+
317
+ {/* ---- Disambiguation Evaluation ---- */}
318
+ {section === "disambiguation" && (
319
+ <div className="panel">
320
+ <div className="flex-row gap-2 mb-2">
321
+ <button className="btn btn-secondary" onClick={loadDisambiguationExample}>
322
+ Load Example
323
+ </button>
324
+ </div>
325
+
326
+ {/* Keyword */}
327
+ <div className="form-group mb-2" style={{ maxWidth: 300 }}>
328
+ <label>Keyword</label>
329
+ <input
330
+ value={keyword}
331
+ onChange={(e) => setKeyword(e.target.value)}
332
+ placeholder='e.g. "pizza"'
333
+ />
334
+ </div>
335
+
336
+ {/* Candidate Meanings */}
337
+ <div className="mb-2">
338
+ <label className="section-label">
339
+ Candidate Meanings
340
+ <span className="text-dim"> — describe each possible meaning</span>
341
+ </label>
342
+ {meanings.map((m, i) => (
343
+ <div key={i} className="flex-row gap-1 mb-1">
344
+ <span className="text-dim" style={{ minWidth: 24 }}>{i + 1}.</span>
345
+ <input
346
+ value={m}
347
+ onChange={(e) => updateMeaning(i, e.target.value)}
348
+ placeholder={`Meaning ${i + 1} description...`}
349
+ style={{ flex: 1 }}
350
+ />
351
+ {meanings.length > 2 && (
352
+ <button className="btn btn-secondary" onClick={() => removeMeaning(i)}>
353
+ &times;
354
+ </button>
355
+ )}
356
+ </div>
357
+ ))}
358
+ <button className="btn btn-secondary mt-1" onClick={addMeaning}>
359
+ + Add Meaning
360
+ </button>
361
+ </div>
362
+
363
+ {/* Ground Truth Examples */}
364
+ <div className="mb-2">
365
+ <label className="section-label">
366
+ Labeled Examples
367
+ <span className="text-dim"> — sentences using the keyword, with the correct meaning</span>
368
+ </label>
369
+ <table className="data-table">
370
+ <thead>
371
+ <tr>
372
+ <th style={{ width: "60%" }}>Sentence</th>
373
+ <th>Correct Meaning</th>
374
+ <th style={{ width: 40 }} />
375
+ </tr>
376
+ </thead>
377
+ <tbody>
378
+ {gtRows.map((row, i) => (
379
+ <tr key={i}>
380
+ <td>
381
+ <input
382
+ value={row.text}
383
+ onChange={(e) => updateGtRow(i, "text", e.target.value)}
384
+ placeholder="A sentence containing the keyword..."
385
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
386
+ />
387
+ </td>
388
+ <td>
389
+ <select
390
+ value={row.meaning}
391
+ onChange={(e) => updateGtRow(i, "meaning", e.target.value)}
392
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
393
+ >
394
+ <option value="">Select...</option>
395
+ {meaningLabels.map((label, j) => (
396
+ <option key={j} value={label}>{label}</option>
397
+ ))}
398
+ </select>
399
+ </td>
400
+ <td>
401
+ {gtRows.length > 1 && (
402
+ <button className="btn btn-secondary" onClick={() => removeGtRow(i)}>
403
+ &times;
404
+ </button>
405
+ )}
406
+ </td>
407
+ </tr>
408
+ ))}
409
+ </tbody>
410
+ </table>
411
+ <button className="btn btn-secondary mt-1" onClick={addGtRow}>
412
+ + Add Example
413
+ </button>
414
+ </div>
415
+
416
+ <button
417
+ className="btn btn-primary"
418
+ onClick={runDisambiguation}
419
+ disabled={loading === "disambig"}
420
+ >
421
+ {loading === "disambig" ? "Evaluating..." : "Run Evaluation"}
422
+ </button>
423
+
424
+ {disambig && disambig.map((m) => (
425
+ <div key={m.keyword} className="mt-3">
426
+ <h3>Results: "{m.keyword}" ({m.total_samples} samples)</h3>
427
+ <div className="metric-grid mb-2">
428
+ <MetricCard value={`${(m.accuracy * 100).toFixed(1)}%`} label="Accuracy" />
429
+ <MetricCard value={`${(m.weighted_f1 * 100).toFixed(1)}%`} label="Weighted F1" />
430
+ </div>
431
+
432
+ <h3>Per-Meaning Scores</h3>
433
+ <table className="data-table">
434
+ <thead>
435
+ <tr>
436
+ <th>Meaning</th>
437
+ <th>Precision</th>
438
+ <th>Recall</th>
439
+ <th>F1</th>
440
+ </tr>
441
+ </thead>
442
+ <tbody>
443
+ {Object.keys(m.per_meaning_f1).map((meaning) => (
444
+ <tr key={meaning}>
445
+ <td>{meaning}</td>
446
+ <td>{m.per_meaning_precision[meaning]?.toFixed(4) ?? "-"}</td>
447
+ <td>{m.per_meaning_recall[meaning]?.toFixed(4) ?? "-"}</td>
448
+ <td style={{ fontWeight: 700 }}>{m.per_meaning_f1[meaning]?.toFixed(4) ?? "-"}</td>
449
+ </tr>
450
+ ))}
451
+ </tbody>
452
+ </table>
453
+
454
+ {m.confusion_matrix && (
455
+ <>
456
+ <h3 className="mt-2">Confusion Matrix</h3>
457
+ <table className="data-table">
458
+ <thead>
459
+ <tr>
460
+ <th>True \ Predicted</th>
461
+ {Object.keys(m.per_meaning_f1).map((meaning) => (
462
+ <th key={meaning}>{meaning}</th>
463
+ ))}
464
+ </tr>
465
+ </thead>
466
+ <tbody>
467
+ {m.confusion_matrix.map((row, i) => (
468
+ <tr key={i}>
469
+ <td style={{ fontWeight: 600 }}>{Object.keys(m.per_meaning_f1)[i]}</td>
470
+ {row.map((val, j) => (
471
+ <td
472
+ key={j}
473
+ style={{
474
+ fontWeight: i === j ? 700 : 400,
475
+ color: i === j ? "var(--ok)" : val > 0 ? "var(--err)" : "var(--text-dim)",
476
+ }}
477
+ >
478
+ {val}
479
+ </td>
480
+ ))}
481
+ </tr>
482
+ ))}
483
+ </tbody>
484
+ </table>
485
+ </>
486
+ )}
487
+ </div>
488
+ ))}
489
+ </div>
490
+ )}
491
+
492
+ {/* ---- Retrieval Evaluation ---- */}
493
+ {section === "retrieval" && (
494
+ <div className="panel">
495
+ <div className="flex-row gap-2 mb-2">
496
+ <button className="btn btn-secondary" onClick={loadRetrievalExample}>
497
+ Load Example
498
+ </button>
499
+ </div>
500
+
501
+ <label className="section-label">
502
+ Search Queries
503
+ <span className="text-dim"> — enter queries and what text they should find</span>
504
+ </label>
505
+ <table className="data-table mb-2">
506
+ <thead>
507
+ <tr>
508
+ <th style={{ width: "50%" }}>Query</th>
509
+ <th>Expected Match (text snippet)</th>
510
+ <th style={{ width: 40 }} />
511
+ </tr>
512
+ </thead>
513
+ <tbody>
514
+ {retRows.map((row, i) => (
515
+ <tr key={i}>
516
+ <td>
517
+ <input
518
+ value={row.query}
519
+ onChange={(e) => updateRetRow(i, "query", e.target.value)}
520
+ placeholder="A search query..."
521
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
522
+ />
523
+ </td>
524
+ <td>
525
+ <input
526
+ value={row.relevantText}
527
+ onChange={(e) => updateRetRow(i, "relevantText", e.target.value)}
528
+ placeholder="Text that should match..."
529
+ style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
530
+ />
531
+ </td>
532
+ <td>
533
+ {retRows.length > 1 && (
534
+ <button className="btn btn-secondary" onClick={() => removeRetRow(i)}>
535
+ &times;
536
+ </button>
537
+ )}
538
+ </td>
539
+ </tr>
540
+ ))}
541
+ </tbody>
542
+ </table>
543
+ <div className="flex-row gap-2 mb-2">
544
+ <button className="btn btn-secondary" onClick={addRetRow}>
545
+ + Add Query
546
+ </button>
547
+ <button
548
+ className="btn btn-primary"
549
+ onClick={runRetrieval}
550
+ disabled={loading === "retrieval"}
551
+ >
552
+ {loading === "retrieval" ? "Evaluating..." : "Run Evaluation"}
553
+ </button>
554
+ </div>
555
+
556
+ {retrieval && (
557
+ <div className="mt-2">
558
+ <table className="data-table">
559
+ <thead>
560
+ <tr>
561
+ <th>Query</th>
562
+ <th>MRR</th>
563
+ <th>P@1</th>
564
+ <th>P@3</th>
565
+ <th>P@5</th>
566
+ <th>Top Score</th>
567
+ </tr>
568
+ </thead>
569
+ <tbody>
570
+ {retrieval.map((m, i) => (
571
+ <tr key={i}>
572
+ <td style={{ maxWidth: 300 }}>{m.query.length > 50 ? m.query.slice(0, 50) + "..." : m.query}</td>
573
+ <td>{m.mrr.toFixed(3)}</td>
574
+ <td>{m.precision_at_k["1"]?.toFixed(2) ?? "-"}</td>
575
+ <td>{m.precision_at_k["3"]?.toFixed(2) ?? "-"}</td>
576
+ <td>{m.precision_at_k["5"]?.toFixed(2) ?? "-"}</td>
577
+ <td>{m.top_score.toFixed(3)}</td>
578
+ </tr>
579
+ ))}
580
+ </tbody>
581
+ </table>
582
+
583
+ <div className="metric-grid mt-3">
584
+ <MetricCard
585
+ value={(retrieval.reduce((s, m) => s + m.mrr, 0) / retrieval.length).toFixed(3)}
586
+ label="Mean MRR"
587
+ />
588
+ <MetricCard
589
+ value={(retrieval.reduce((s, m) => s + (m.precision_at_k["5"] ?? 0), 0) / retrieval.length).toFixed(3)}
590
+ label="Mean P@5"
591
+ />
592
+ <MetricCard
593
+ value={(retrieval.reduce((s, m) => s + m.top_score, 0) / retrieval.length).toFixed(3)}
594
+ label="Mean Top Score"
595
+ />
596
+ </div>
597
+ </div>
598
+ )}
599
+ </div>
600
+ )}
601
+ </div>
602
+ );
603
+ }
frontend/src/components/KeywordAnalysis.tsx ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import type { KeywordAnalysisResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
+ export default function KeywordAnalysis() {
9
+ const [keyword, setKeyword] = useState("");
10
+ const [topK, setTopK] = useState(5);
11
+ const [threshold, setThreshold] = useState(0.4);
12
+ const { data: analysis, loading, error, run } = useApiCall<KeywordAnalysisResponse>();
13
+
14
+ async function handleAnalyze() {
15
+ if (!keyword.trim()) return;
16
+ await run(() => api.analyzeKeyword({ keyword, top_k: topK, cluster_threshold: threshold }));
17
+ }
18
+
19
+ return (
20
+ <div>
21
+ <div className="panel">
22
+ <h2>Keyword Analysis</h2>
23
+ <p className="panel-desc">
24
+ Find all occurrences of a keyword, cluster them by contextual meaning,
25
+ and discover semantically similar passages for each meaning.
26
+ </p>
27
+ <div className="form-row">
28
+ <div className="form-group">
29
+ <label>Keyword</label>
30
+ <input
31
+ value={keyword}
32
+ onChange={(e) => setKeyword(e.target.value)}
33
+ placeholder="e.g. pizza"
34
+ onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
35
+ />
36
+ </div>
37
+ <div className="form-group form-group-sm">
38
+ <label>Top K</label>
39
+ <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
40
+ </div>
41
+ <div className="form-group form-group-md">
42
+ <label>Cluster Threshold</label>
43
+ <input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
44
+ </div>
45
+ <div className="form-group form-group-sm">
46
+ <label>&nbsp;</label>
47
+ <button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keyword.trim()}>
48
+ {loading ? "Analyzing..." : "Analyze"}
49
+ </button>
50
+ </div>
51
+ </div>
52
+ </div>
53
+
54
+ {error && <StatusMessage type="err" message={error} />}
55
+
56
+ {analysis && (
57
+ <div className="panel">
58
+ <h3>
59
+ "{analysis.keyword}" &mdash; {analysis.total_occurrences} occurrence(s),{" "}
60
+ {analysis.meaning_clusters.length} meaning cluster(s)
61
+ </h3>
62
+
63
+ {analysis.meaning_clusters.map((cluster) => (
64
+ <div key={cluster.cluster_id} className="result-card mt-2">
65
+ <div className="result-header">
66
+ <div>
67
+ <strong>Cluster {cluster.cluster_id}</strong>{" "}
68
+ <span className="tag">{cluster.size} occurrence(s)</span>
69
+ </div>
70
+ </div>
71
+
72
+ <div className="mt-1 mb-2">
73
+ <div className="section-label">Contexts:</div>
74
+ {cluster.contexts.map((ctx, i) => (
75
+ <div key={i} className="result-text" style={{ marginBottom: 4, paddingLeft: 12 }}>
76
+ <span className="badge" style={{ marginRight: 6 }}>{ctx.doc_id}</span>
77
+ {ctx.text.slice(0, 200)}...
78
+ </div>
79
+ ))}
80
+ </div>
81
+
82
+ <div>
83
+ <div className="section-label">Similar passages:</div>
84
+ {cluster.similar_passages.map((sp) => (
85
+ <div key={sp.rank} className="flex-row" style={{ alignItems: "start", marginBottom: 6 }}>
86
+ <ScoreBar score={sp.score} />
87
+ <span className="result-text" style={{ flex: 1 }}>
88
+ <span className="badge" style={{ marginRight: 4 }}>{sp.doc_id}</span>
89
+ {sp.text.slice(0, 150)}...
90
+ </span>
91
+ </div>
92
+ ))}
93
+ </div>
94
+ </div>
95
+ ))}
96
+ </div>
97
+ )}
98
+ </div>
99
+ );
100
+ }
frontend/src/components/KeywordMatcher.tsx ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { MatchResponse } from "../types";
4
+ import { useApiCall } from "../hooks/useApiCall";
5
+ import ScoreBar from "./ScoreBar";
6
+ import StatusMessage from "./StatusMessage";
7
+
8
+ export default function KeywordMatcher() {
9
+ const [keyword, setKeyword] = useState("");
10
+ const [meaningsText, setMeaningsText] = useState("");
11
+ const { data: results, loading, error, setError, run } = useApiCall<MatchResponse>();
12
+
13
+ async function handleMatch() {
14
+ if (!keyword.trim() || !meaningsText.trim()) return;
15
+ const candidates = meaningsText.split("\n").map((s) => s.trim()).filter(Boolean);
16
+ if (candidates.length < 2) {
17
+ setError("Provide at least 2 candidate meanings (one per line).");
18
+ return;
19
+ }
20
+ await run(() => api.matchKeyword({ keyword, candidate_meanings: candidates }));
21
+ }
22
+
23
+ return (
24
+ <div>
25
+ <div className="panel">
26
+ <h2>Keyword Meaning Matcher</h2>
27
+ <p className="panel-desc">
28
+ Match each occurrence of a keyword to the most likely intended meaning.
29
+ For example: keyword "pizza" with candidates "food" and "school".
30
+ </p>
31
+ <div className="form-row">
32
+ <div className="form-group form-group-lg">
33
+ <label>Keyword</label>
34
+ <input value={keyword} onChange={(e) => setKeyword(e.target.value)} placeholder="e.g. pizza" />
35
+ </div>
36
+ </div>
37
+ <div className="form-group mb-2">
38
+ <label>Candidate Meanings (one per line)</label>
39
+ <textarea
40
+ value={meaningsText}
41
+ onChange={(e) => setMeaningsText(e.target.value)}
42
+ placeholder={`Italian food made with dough, tomato sauce, and cheese\nSchool, education, and academic activities`}
43
+ rows={4}
44
+ />
45
+ </div>
46
+ <button className="btn btn-primary" onClick={handleMatch} disabled={loading || !keyword.trim() || !meaningsText.trim()}>
47
+ {loading ? "Matching..." : "Match"}
48
+ </button>
49
+ </div>
50
+
51
+ {error && <StatusMessage type="err" message={error} />}
52
+
53
+ {results && (
54
+ <div className="panel">
55
+ <h3>Matches for "{results.keyword}" ({results.matches.length} occurrences)</h3>
56
+
57
+ {results.matches.map((m, idx) => (
58
+ <div key={idx} className="result-card mt-1">
59
+ <div className="result-header">
60
+ <div>
61
+ <span className="badge">{m.doc_id}</span>{" "}
62
+ <span className="tag">chunk {m.chunk_index}</span>
63
+ </div>
64
+ <span className="tag tag-best">{m.best_match}</span>
65
+ </div>
66
+ <div className="result-text mb-1">{m.text.slice(0, 250)}...</div>
67
+ <div className="flex-row flex-wrap gap-2">
68
+ {Object.entries(m.all_scores).map(([meaning, score]) => (
69
+ <div key={meaning} style={{ flex: "1 1 200px" }}>
70
+ <div
71
+ style={{
72
+ fontSize: "0.78rem",
73
+ color: meaning === m.best_match ? "var(--ok)" : "var(--text-dim)",
74
+ fontWeight: meaning === m.best_match ? 700 : 400,
75
+ marginBottom: 2,
76
+ }}
77
+ >
78
+ {meaning.slice(0, 60)}
79
+ </div>
80
+ <ScoreBar score={score} />
81
+ </div>
82
+ ))}
83
+ </div>
84
+ </div>
85
+ ))}
86
+ </div>
87
+ )}
88
+ </div>
89
+ );
90
+ }
frontend/src/components/LogViewer.tsx ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useEffect, useRef } from "react";
2
+
3
+ interface Props {
4
+ /** Whether to actively stream logs */
5
+ active: boolean;
6
+ }
7
+
8
+ export default function LogViewer({ active }: Props) {
9
+ const [lines, setLines] = useState<string[]>([]);
10
+ const containerRef = useRef<HTMLDivElement>(null);
11
+
12
+ useEffect(() => {
13
+ if (!active) return;
14
+
15
+ setLines([]);
16
+ const evtSource = new EventSource("/api/logs/stream");
17
+
18
+ evtSource.onmessage = (event) => {
19
+ setLines((prev) => {
20
+ const next = [...prev, event.data];
21
+ // Keep last 200 lines
22
+ return next.length > 200 ? next.slice(-200) : next;
23
+ });
24
+ };
25
+
26
+ evtSource.onerror = () => {
27
+ // SSE will auto-reconnect, no action needed
28
+ };
29
+
30
+ return () => {
31
+ evtSource.close();
32
+ };
33
+ }, [active]);
34
+
35
+ useEffect(() => {
36
+ // Auto-scroll to bottom
37
+ if (containerRef.current) {
38
+ containerRef.current.scrollTop = containerRef.current.scrollHeight;
39
+ }
40
+ }, [lines]);
41
+
42
+ if (!active && lines.length === 0) return null;
43
+
44
+ return (
45
+ <div
46
+ ref={containerRef}
47
+ style={{
48
+ background: "#0a0c10",
49
+ border: "1px solid var(--border)",
50
+ borderRadius: "var(--radius)",
51
+ padding: "10px 14px",
52
+ marginTop: 12,
53
+ maxHeight: 220,
54
+ overflowY: "auto",
55
+ fontFamily: "'JetBrains Mono', 'Fira Code', 'Consolas', monospace",
56
+ fontSize: "0.75rem",
57
+ lineHeight: 1.7,
58
+ color: "var(--text-dim)",
59
+ }}
60
+ >
61
+ {lines.length === 0 && active && (
62
+ <span style={{ color: "var(--text-dim)", opacity: 0.5 }}>Waiting for logs...</span>
63
+ )}
64
+ {lines.map((line, i) => (
65
+ <div key={i} style={{ whiteSpace: "pre-wrap", wordBreak: "break-all" }}>
66
+ {line}
67
+ </div>
68
+ ))}
69
+ </div>
70
+ );
71
+ }
frontend/src/components/MetricCard.tsx ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
interface Props {
  value: string | number;
  label: string;
  valueColor?: string;
}

/** Small stat tile: a prominent value above a caption, optionally tinted. */
export default function MetricCard({ value, label, valueColor }: Props) {
  const valueStyle = valueColor ? { color: valueColor } : undefined;
  return (
    <div className="metric-card">
      <div className="metric-value" style={valueStyle}>{value}</div>
      <div className="metric-label">{label}</div>
    </div>
  );
}
frontend/src/components/ScoreBar.tsx ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { scoreColor } from "../utils/colors";

interface ScoreBarProps {
  score: number;
  max?: number;
}

/**
 * Horizontal bar visualizing a score relative to `max` (default 1),
 * with the numeric value rendered beside it in a matching color.
 */
export default function ScoreBar({ score, max = 1 }: ScoreBarProps) {
  // Clamp the fill width to 0–100% so out-of-range scores don't overflow.
  const fillPercent = Math.min(100, Math.max(0, (score / max) * 100));
  const barColor = scoreColor(score);
  return (
    <div className="score-bar-container">
      <div className="score-bar">
        <div
          className="score-bar-fill"
          style={{ width: `${fillPercent}%`, background: barColor }}
        />
      </div>
      <span className="score-label" style={{ color: barColor }}>{score.toFixed(4)}</span>
    </div>
  );
}
frontend/src/components/Select.tsx ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState, useRef, useEffect } from "react";

interface Option {
  value: string;
  label: string;
}

interface Props {
  options: Option[];
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
}

/** Minimal custom dropdown; closes when clicking anywhere outside of it. */
export default function Select({ options, value, onChange, placeholder }: Props) {
  const [open, setOpen] = useState(false);
  const rootRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    // Close the dropdown on any mousedown outside the component.
    const onDocMouseDown = (e: MouseEvent) => {
      const root = rootRef.current;
      if (root && !root.contains(e.target as Node)) {
        setOpen(false);
      }
    };
    document.addEventListener("mousedown", onDocMouseDown);
    return () => document.removeEventListener("mousedown", onDocMouseDown);
  }, []);

  const current = options.find((o) => o.value === value);

  function choose(next: string) {
    onChange(next);
    setOpen(false);
  }

  return (
    <div className="custom-select" ref={rootRef}>
      <button
        type="button"
        className="custom-select-trigger"
        onClick={() => setOpen((prev) => !prev)}
      >
        <span>{current?.label || placeholder || "Select..."}</span>
        <span className="custom-select-arrow">{open ? "\u25b4" : "\u25be"}</span>
      </button>
      {open && (
        <div className="custom-select-dropdown">
          {options.map((opt) => (
            <button
              type="button"
              key={opt.value}
              className={`custom-select-option ${opt.value === value ? "custom-select-option-active" : ""}`}
              onClick={() => choose(opt.value)}
            >
              {opt.label}
            </button>
          ))}
        </div>
      )}
    </div>
  );
}
frontend/src/components/SemanticSearch.tsx ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState } from "react";
import { api } from "../api";
import type { QueryResultItem } from "../types";
import { useApiCall } from "../hooks/useApiCall";
import ScoreBar from "./ScoreBar";
import StatusMessage from "./StatusMessage";

/** Corpus-wide semantic search panel backed by the /query endpoint. */
export default function SemanticSearch() {
  const [query, setQuery] = useState("");
  const [topK, setTopK] = useState(10);
  const { data: results, loading, error, run } = useApiCall<QueryResultItem[]>();

  async function handleSearch() {
    if (query.trim() === "") return;
    await run(async () => {
      const res = await api.query({ text: query, top_k: topK });
      return res.results;
    });
  }

  return (
    <div>
      <div className="panel">
        <h2>Semantic Search</h2>
        <p className="panel-desc">
          Find passages most semantically similar to your query across the entire corpus.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Query</label>
            <input
              value={query}
              onChange={(e) => setQuery(e.target.value)}
              placeholder="e.g. a place where children learn and take tests"
              onKeyDown={(e) => e.key === "Enter" && handleSearch()}
            />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
          </div>
          <div className="form-group form-group-sm">
            <label>&nbsp;</label>
            <button className="btn btn-primary" onClick={handleSearch} disabled={loading || !query.trim()}>
              {loading ? "Searching..." : "Search"}
            </button>
          </div>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && (
        <div className="panel">
          <h3>Results ({results.length})</h3>
          {results.map((r) => (
            <div key={`${r.doc_id}-${r.chunk_index}`} className="result-card">
              <div className="result-header">
                <div>
                  <span className="badge">#{r.rank}</span>{" "}
                  <span className="badge">{r.doc_id}</span>{" "}
                  <span className="tag">chunk {r.chunk_index}</span>
                </div>
                <ScoreBar score={r.score} />
              </div>
              <div className="result-text">{r.text}</div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
frontend/src/components/SimilarWords.tsx ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { useState } from "react";
import { api } from "../api";
import { useApiCall } from "../hooks/useApiCall";
import ScoreBar from "./ScoreBar";
import StatusMessage from "./StatusMessage";

interface SimilarWord {
  word: string;
  score: number;
}

/** Finds contextually similar words via the transformer-embedding endpoint. */
export default function SimilarWords() {
  const [word, setWord] = useState("");
  const [topK, setTopK] = useState(10);
  const { data: results, loading, error, run } = useApiCall<SimilarWord[]>();

  async function handleSearch() {
    const trimmed = word.trim();
    if (!trimmed) return;
    await run(async () => {
      const res = await api.similarWords({ word: trimmed, top_k: topK });
      return res.similar;
    });
  }

  return (
    <div>
      <div className="panel">
        <h2>Similar Words</h2>
        <p className="panel-desc">
          Find words that appear in similar contexts using transformer embeddings.
          Unlike Word2Vec (static, one vector per word), this uses the model's contextual understanding.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Word</label>
            <input
              value={word}
              onChange={(e) => setWord(e.target.value)}
              onKeyDown={(e) => e.key === "Enter" && handleSearch()}
              placeholder="e.g. Epstein, flight, island"
            />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
          </div>
          <div className="form-group form-group-sm">
            <label>&nbsp;</label>
            <button className="btn btn-primary" onClick={handleSearch} disabled={loading || !word.trim()}>
              {loading ? "Searching..." : "Find"}
            </button>
          </div>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && results.length > 0 && (
        <div className="panel">
          <h3>Words similar to "{word}" ({results.length})</h3>
          <table className="data-table">
            <thead>
              <tr><th>Word</th><th>Similarity</th></tr>
            </thead>
            <tbody>
              {results.map((r, i) => (
                <tr key={i}>
                  <td style={{ fontWeight: 600 }}>{r.word}</td>
                  <td><ScoreBar score={r.score} /></td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  );
}
frontend/src/components/StatusMessage.tsx ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface StatusMessageProps {
2
+ type: "ok" | "err" | "loading";
3
+ message: string;
4
+ }
5
+
6
+ export default function StatusMessage({ type, message }: StatusMessageProps) {
7
+ return (
8
+ <div className={`status status-${type}`}>
9
+ {type === "loading" && <span className="spinner" />}
10
+ {message}
11
+ </div>
12
+ );
13
+ }
frontend/src/components/Switch.tsx ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface Props {
2
+ checked: boolean;
3
+ onChange: (checked: boolean) => void;
4
+ label?: string;
5
+ }
6
+
7
+ export default function Switch({ checked, onChange, label }: Props) {
8
+ return (
9
+ <label className="switch">
10
+ <button
11
+ className={`switch-track ${checked ? "switch-track-on" : ""}`}
12
+ onClick={() => onChange(!checked)}
13
+ type="button"
14
+ role="switch"
15
+ aria-checked={checked}
16
+ >
17
+ <span className="switch-thumb" />
18
+ </button>
19
+ {label && <span className="switch-label">{label}</span>}
20
+ </label>
21
+ );
22
+ }
frontend/src/components/TextCompare.tsx ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api } from "../api";
3
+ import { useApiCall } from "../hooks/useApiCall";
4
+ import { scoreColor } from "../utils/colors";
5
+ import StatusMessage from "./StatusMessage";
6
+
7
+ export default function TextCompare() {
8
+ const [textA, setTextA] = useState("");
9
+ const [textB, setTextB] = useState("");
10
+ const { data: similarity, loading, error, run } = useApiCall<number>();
11
+
12
+ async function handleCompare() {
13
+ if (!textA.trim() || !textB.trim()) return;
14
+ await run(() => api.compare({ text_a: textA, text_b: textB }).then((r) => r.similarity));
15
+ }
16
+
17
+ return (
18
+ <div>
19
+ <div className="panel">
20
+ <h2>Compare Texts</h2>
21
+ <p className="panel-desc">
22
+ Compute cosine similarity between two sentences/passages.
23
+ </p>
24
+ <div className="grid-2">
25
+ <div className="form-group">
26
+ <label>Text A</label>
27
+ <textarea
28
+ value={textA}
29
+ onChange={(e) => setTextA(e.target.value)}
30
+ placeholder="I love pizza so much"
31
+ rows={4}
32
+ />
33
+ </div>
34
+ <div className="form-group">
35
+ <label>Text B</label>
36
+ <textarea
37
+ value={textB}
38
+ onChange={(e) => setTextB(e.target.value)}
39
+ placeholder="I love school so much"
40
+ rows={4}
41
+ />
42
+ </div>
43
+ </div>
44
+ <div className="mt-2">
45
+ <button
46
+ className="btn btn-primary"
47
+ onClick={handleCompare}
48
+ disabled={loading || !textA.trim() || !textB.trim()}
49
+ >
50
+ {loading ? "Computing..." : "Compare"}
51
+ </button>
52
+ </div>
53
+ </div>
54
+
55
+ {error && <StatusMessage type="err" message={error} />}
56
+
57
+ {similarity !== null && (
58
+ <div className="panel">
59
+ <div className="similarity-gauge">
60
+ <div className="similarity-value" style={{ color: scoreColor(similarity) }}>
61
+ {similarity.toFixed(4)}
62
+ </div>
63
+ <div className="similarity-label">Cosine Similarity</div>
64
+ <div style={{ width: "100%", maxWidth: 400, marginTop: 16 }}>
65
+ <div className="score-bar" style={{ width: "100%", height: 12 }}>
66
+ <div
67
+ className="score-bar-fill"
68
+ style={{
69
+ width: `${Math.max(0, similarity) * 100}%`,
70
+ background: scoreColor(similarity),
71
+ }}
72
+ />
73
+ </div>
74
+ <div className="score-bar-legend">
75
+ <span>0 (unrelated)</span>
76
+ <span>1 (identical)</span>
77
+ </div>
78
+ </div>
79
+ </div>
80
+ </div>
81
+ )}
82
+ </div>
83
+ );
84
+ }
frontend/src/components/Toggle.tsx ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ interface Option {
2
+ value: string;
3
+ label: string;
4
+ }
5
+
6
+ interface Props {
7
+ options: Option[];
8
+ value: string;
9
+ onChange: (value: string) => void;
10
+ }
11
+
12
+ export default function Toggle({ options, value, onChange }: Props) {
13
+ return (
14
+ <div className="toggle">
15
+ {options.map((opt) => (
16
+ <button
17
+ key={opt.value}
18
+ className={`toggle-option ${opt.value === value ? "toggle-option-active" : ""}`}
19
+ onClick={() => onChange(opt.value)}
20
+ type="button"
21
+ >
22
+ {opt.label}
23
+ </button>
24
+ ))}
25
+ </div>
26
+ );
27
+ }
frontend/src/components/TrainingPanel.tsx ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { TrainResponse, QueryResultItem, CompareResponse } from "../types";
4
+ import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
+ import { scoreColor } from "../utils/colors";
6
+ import ScoreBar from "./ScoreBar";
7
+ import StatusMessage from "./StatusMessage";
8
+ import MetricCard from "./MetricCard";
9
+ import Toggle from "./Toggle";
10
+ import Select from "./Select";
11
+ import LogViewer from "./LogViewer";
12
+
13
+ type Strategy = "unsupervised" | "contrastive" | "keywords";
14
+
15
+ interface SimilarWord {
16
+ word: string;
17
+ score: number;
18
+ }
19
+
20
+ const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
21
+ { id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
22
+ { id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
23
+ { id: "keywords", label: "Keyword-supervised", desc: "You provide keyword→meaning map. Best if you know the code words." },
24
+ ];
25
+
26
+ const MODELS = [
27
+ { value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast)" },
28
+ { value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality)" },
29
+ ];
30
+
31
+ export default function TrainingPanel() {
32
+ // Training
33
+ const [strategy, setStrategy] = useState<Strategy>("contrastive");
34
+ const [baseModel, setBaseModel] = useState("all-MiniLM-L6-v2");
35
+ const [outputPath, setOutputPath] = useState("./trained_model");
36
+ const [epochs, setEpochs] = useState(5);
37
+ const [batchSize, setBatchSize] = useState(16);
38
+ const [keywordMapText, setKeywordMapText] = useState('{\n "pizza": "school",\n "pepperoni": "math class"\n}');
39
+ const [showAdvanced, setShowAdvanced] = useState(false);
40
+ const [training, setTraining] = useState(false);
41
+ const [result, setResult] = useState<TrainResponse | null>(null);
42
+
43
+ const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
44
+
45
+ // Similar words
46
+ const [simWord, setSimWord] = useState("");
47
+ const [simTopK, setSimTopK] = useState(10);
48
+ const [simResults, setSimResults] = useState<SimilarWord[]>([]);
49
+ const [simLoading, setSimLoading] = useState(false);
50
+
51
+ // Compare
52
+ const [compTextA, setCompTextA] = useState("");
53
+ const [compTextB, setCompTextB] = useState("");
54
+ const [compResult, setCompResult] = useState<CompareResponse | null>(null);
55
+ const [compLoading, setCompLoading] = useState(false);
56
+
57
+ // Search
58
+ const [queryText, setQueryText] = useState("");
59
+ const [queryTopK, setQueryTopK] = useState(5);
60
+ const [queryResults, setQueryResults] = useState<QueryResultItem[]>([]);
61
+ const [queryLoading, setQueryLoading] = useState(false);
62
+
63
+ const ready = result !== null;
64
+
65
+ async function handleTrain() {
66
+ setTraining(true); setError(""); setResult(null);
67
+ try {
68
+ const corpus = parseCorpus();
69
+ if (!corpus.length) { setError("Corpus is empty."); setTraining(false); return; }
70
+
71
+ const base = { corpus_texts: corpus, base_model: baseModel, output_path: outputPath, epochs, batch_size: batchSize };
72
+ let res: TrainResponse;
73
+
74
+ if (strategy === "unsupervised") {
75
+ res = await api.trainUnsupervised(base);
76
+ } else if (strategy === "contrastive") {
77
+ res = await api.trainContrastive(base);
78
+ } else {
79
+ const kw = JSON.parse(keywordMapText);
80
+ res = await api.trainKeywords({ ...base, keyword_meanings: kw });
81
+ }
82
+ setResult(res);
83
+ } catch (e) {
84
+ setError(e instanceof SyntaxError ? "Invalid JSON in keyword map." : getErrorMessage(e));
85
+ } finally {
86
+ setTraining(false);
87
+ }
88
+ }
89
+
90
+ async function handleSimilarWords() {
91
+ setSimLoading(true); setError("");
92
+ try {
93
+ const res = await api.similarWords({ word: simWord, top_k: simTopK });
94
+ setSimResults(res.similar);
95
+ } catch (err) {
96
+ setError(getErrorMessage(err));
97
+ } finally {
98
+ setSimLoading(false);
99
+ }
100
+ }
101
+
102
+ async function handleCompare() {
103
+ setCompLoading(true); setError("");
104
+ try {
105
+ const res = await api.compare({ text_a: compTextA, text_b: compTextB });
106
+ setCompResult(res);
107
+ } catch (err) {
108
+ setError(getErrorMessage(err));
109
+ } finally {
110
+ setCompLoading(false);
111
+ }
112
+ }
113
+
114
+ async function handleQuery() {
115
+ setQueryLoading(true); setError("");
116
+ try {
117
+ const res = await api.query({ text: queryText, top_k: queryTopK });
118
+ setQueryResults(res.results);
119
+ } catch (err) {
120
+ setError(getErrorMessage(err));
121
+ } finally {
122
+ setQueryLoading(false);
123
+ }
124
+ }
125
+
126
+ return (
127
+ <div>
128
+ {/* 1. Training (strategy + config + corpus merged) */}
129
+ <div className="panel">
130
+ <h2>1. Fine-tune Transformer</h2>
131
+ <p className="panel-desc">
132
+ Fine-tune a pre-trained sentence transformer on your corpus to improve contextual understanding.
133
+ </p>
134
+
135
+ <div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
136
+ <button className="btn btn-secondary" onClick={loadFromEngine}
137
+ disabled={corpusLoading}>
138
+ {corpusLoading ? "Loading..." : "Load from Engine"}
139
+ </button>
140
+ {corpusText && (
141
+ <button className="btn btn-secondary" onClick={() => setCorpusText("")}>
142
+ Clear
143
+ </button>
144
+ )}
145
+ </div>
146
+ <div className="form-group" style={{ marginBottom: 12 }}>
147
+ <label>
148
+ Corpus (separate documents with blank lines)
149
+ {corpusText && (
150
+ <span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
151
+ {" "} — {parseCorpus().length} documents detected
152
+ </span>
153
+ )}
154
+ </label>
155
+ <textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
156
+ placeholder="Document 1 text...\n\nDocument 2 text..." />
157
+ </div>
158
+
159
+ <label className="section-label">Strategy</label>
160
+ <Toggle
161
+ options={STRATEGIES.map(s => ({ value: s.id, label: s.label }))}
162
+ value={strategy}
163
+ onChange={(v) => setStrategy(v as Strategy)}
164
+ />
165
+ <p style={{ color: "var(--text-dim)", fontSize: "0.85rem", marginBottom: 12 }}>
166
+ {STRATEGIES.find(s => s.id === strategy)?.desc}
167
+ </p>
168
+
169
+ {strategy === "keywords" && (
170
+ <div className="form-group" style={{ marginBottom: 12 }}>
171
+ <label>Keyword → Meaning Map (JSON)</label>
172
+ <textarea value={keywordMapText} onChange={e => setKeywordMapText(e.target.value)}
173
+ rows={4} style={{ fontFamily: "monospace", fontSize: "0.8rem" }} />
174
+ </div>
175
+ )}
176
+
177
+ <div className="form-row" style={{ marginBottom: 12 }}>
178
+ <div className="form-group">
179
+ <label>Base Model</label>
180
+ <Select options={MODELS} value={baseModel} onChange={setBaseModel} />
181
+ </div>
182
+ </div>
183
+
184
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
185
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
186
+ </button>
187
+
188
+ {showAdvanced && (
189
+ <div className="advanced-section">
190
+ <div className="form-row">
191
+ <div className="form-group" style={{ maxWidth: 100 }}>
192
+ <label>Epochs</label>
193
+ <input type="number" value={epochs} onChange={e => setEpochs(+e.target.value)} min={1} max={50} />
194
+ </div>
195
+ <div className="form-group" style={{ maxWidth: 120 }}>
196
+ <label>Batch Size</label>
197
+ <input type="number" value={batchSize} onChange={e => setBatchSize(+e.target.value)} min={4} max={128} />
198
+ </div>
199
+ <div className="form-group" style={{ maxWidth: 200 }}>
200
+ <label>Output Path</label>
201
+ <input value={outputPath} onChange={e => setOutputPath(e.target.value)} />
202
+ </div>
203
+ </div>
204
+ </div>
205
+ )}
206
+
207
+ <button className="btn btn-primary" onClick={handleTrain}
208
+ disabled={training || !corpusText.trim()} style={{ marginTop: 8 }}>
209
+ {training ? <><span className="spinner" /> Training...</> : "Start Training"}
210
+ </button>
211
+
212
+ <LogViewer active={training} />
213
+ </div>
214
+
215
+ {error && <StatusMessage type="err" message={error} />}
216
+
217
+ {result && (
218
+ <div className="panel">
219
+ <h2>Training Complete</h2>
220
+ <div className="metric-grid">
221
+ <MetricCard value={result.training_pairs} label="Training Pairs" />
222
+ <MetricCard value={result.epochs} label="Epochs" />
223
+ <MetricCard value={`${result.seconds}s`} label="Time" />
224
+ </div>
225
+ <StatusMessage type="ok"
226
+ message={`Model saved: ${result.model_path} — use this path in the Setup tab.`} />
227
+ </div>
228
+ )}
229
+
230
+ {/* 2. Similar Words */}
231
+ <div className="panel">
232
+ <h2>2. Similar Words</h2>
233
+ <p className="panel-desc">
234
+ Find words that appear in similar contexts using transformer embeddings.
235
+ </p>
236
+ <div className="form-row">
237
+ <div className="form-group">
238
+ <label>Word</label>
239
+ <input value={simWord} onChange={e => setSimWord(e.target.value)}
240
+ onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
241
+ placeholder="e.g. pizza" />
242
+ </div>
243
+ <div className="form-group form-group-sm">
244
+ <label>Top K</label>
245
+ <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
246
+ </div>
247
+ <div className="form-group form-group-sm">
248
+ <label>&nbsp;</label>
249
+ <button className="btn btn-primary" onClick={handleSimilarWords}
250
+ disabled={simLoading || !simWord.trim()}>
251
+ {simLoading ? "Searching..." : "Find"}
252
+ </button>
253
+ </div>
254
+ </div>
255
+
256
+ {simResults.length > 0 && (
257
+ <table className="data-table" style={{ marginTop: 12 }}>
258
+ <thead>
259
+ <tr><th>Word</th><th>Similarity</th></tr>
260
+ </thead>
261
+ <tbody>
262
+ {simResults.map((r, i) => (
263
+ <tr key={i}>
264
+ <td style={{ fontWeight: 600 }}>{r.word}</td>
265
+ <td><ScoreBar score={r.score} /></td>
266
+ </tr>
267
+ ))}
268
+ </tbody>
269
+ </table>
270
+ )}
271
+ </div>
272
+
273
+ {/* 3. Compare Texts */}
274
+ <div className="panel">
275
+ <h2>3. Compare Texts</h2>
276
+ <p className="panel-desc">
277
+ Sentence similarity via transformer contextual embeddings.
278
+ </p>
279
+ <div className="form-row">
280
+ <div className="form-group">
281
+ <label>Text A</label>
282
+ <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
283
+ placeholder="pizza gives me homework" />
284
+ </div>
285
+ <div className="form-group">
286
+ <label>Text B</label>
287
+ <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
288
+ placeholder="school gives me homework" />
289
+ </div>
290
+ </div>
291
+ <button className="btn btn-primary" onClick={handleCompare}
292
+ disabled={compLoading || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
293
+ {compLoading ? "Comparing..." : "Compare"}
294
+ </button>
295
+
296
+ {compResult && (
297
+ <div className="similarity-gauge" style={{ marginTop: 16 }}>
298
+ <div className="similarity-value"
299
+ style={{ color: scoreColor(compResult.similarity) }}>
300
+ {compResult.similarity.toFixed(4)}
301
+ </div>
302
+ <div className="similarity-label">Transformer Cosine Similarity</div>
303
+ </div>
304
+ )}
305
+ </div>
306
+
307
+ {/* 4. Semantic Search */}
308
+ <div className="panel">
309
+ <h2>4. Semantic Search</h2>
310
+ <p className="panel-desc">
311
+ Search your corpus using transformer embeddings.
312
+ </p>
313
+ <div className="form-row">
314
+ <div className="form-group">
315
+ <label>Query</label>
316
+ <input value={queryText} onChange={e => setQueryText(e.target.value)}
317
+ onKeyDown={e => e.key === "Enter" && handleQuery()}
318
+ placeholder="a place where children learn" />
319
+ </div>
320
+ <div className="form-group form-group-sm">
321
+ <label>Top K</label>
322
+ <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
323
+ </div>
324
+ <div className="form-group form-group-sm">
325
+ <label>&nbsp;</label>
326
+ <button className="btn btn-primary" onClick={handleQuery}
327
+ disabled={queryLoading || !queryText.trim()}>
328
+ {queryLoading ? "Searching..." : "Search"}
329
+ </button>
330
+ </div>
331
+ </div>
332
+
333
+ {queryResults.length > 0 && (
334
+ <div style={{ marginTop: 12 }}>
335
+ {queryResults.map((r, i) => (
336
+ <div key={i} className="result-card">
337
+ <div className="result-header">
338
+ <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
339
+ <ScoreBar score={r.score} />
340
+ </div>
341
+ <div className="result-text">{r.text}</div>
342
+ </div>
343
+ ))}
344
+ </div>
345
+ )}
346
+ </div>
347
+ </div>
348
+ );
349
+ }
frontend/src/components/Word2VecPanel.tsx ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+ import type { W2VInitResponse, W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
4
+ import { useCorpusLoader } from "../hooks/useCorpusLoader";
5
+ import { scoreColor } from "../utils/colors";
6
+ import ScoreBar from "./ScoreBar";
7
+ import StatusMessage from "./StatusMessage";
8
+ import LogViewer from "./LogViewer";
9
+ import MetricCard from "./MetricCard";
10
+
11
+ export default function Word2VecPanel() {
12
+ // Init
13
+ const [vectorSize, setVectorSize] = useState(100);
14
+ const [windowSize, setWindowSize] = useState(5);
15
+ const [w2vEpochs, setW2vEpochs] = useState(50);
16
+ const [showAdvanced, setShowAdvanced] = useState(false);
17
+ const [initLoading, setInitLoading] = useState(false);
18
+ const [initResult, setInitResult] = useState<W2VInitResponse | null>(null);
19
+
20
+ const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
21
+
22
+ // Similar words
23
+ const [simWord, setSimWord] = useState("");
24
+ const [simTopK, setSimTopK] = useState(10);
25
+ const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
26
+ const [simLoading, setSimLoading] = useState(false);
27
+
28
+ // Compare
29
+ const [compTextA, setCompTextA] = useState("");
30
+ const [compTextB, setCompTextB] = useState("");
31
+ const [compResult, setCompResult] = useState<CompareResponse | null>(null);
32
+ const [compLoading, setCompLoading] = useState(false);
33
+
34
+ // Query
35
+ const [queryText, setQueryText] = useState("");
36
+ const [queryTopK, setQueryTopK] = useState(5);
37
+ const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
38
+ const [queryLoading, setQueryLoading] = useState(false);
39
+
40
+ async function handleInit() {
41
+ setInitLoading(true); setError(""); setInitResult(null);
42
+ try {
43
+ const corpus = parseCorpus();
44
+ if (!corpus.length) { setError("Corpus is empty."); setInitLoading(false); return; }
45
+ const res = await api.w2vInit({
46
+ corpus_texts: corpus,
47
+ vector_size: vectorSize,
48
+ window: windowSize,
49
+ epochs: w2vEpochs,
50
+ });
51
+ setInitResult(res);
52
+ } catch (err) {
53
+ setError(getErrorMessage(err));
54
+ } finally {
55
+ setInitLoading(false);
56
+ }
57
+ }
58
+
59
+ async function handleSimilarWords() {
60
+ setSimLoading(true); setError("");
61
+ try {
62
+ const res = await api.w2vSimilarWords({ word: simWord, top_k: simTopK });
63
+ setSimResults(res.similar);
64
+ } catch (err) {
65
+ setError(getErrorMessage(err));
66
+ } finally {
67
+ setSimLoading(false);
68
+ }
69
+ }
70
+
71
+ async function handleCompare() {
72
+ setCompLoading(true); setError("");
73
+ try {
74
+ const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
75
+ setCompResult(res);
76
+ } catch (err) {
77
+ setError(getErrorMessage(err));
78
+ } finally {
79
+ setCompLoading(false);
80
+ }
81
+ }
82
+
83
+ async function handleQuery() {
84
+ setQueryLoading(true); setError("");
85
+ try {
86
+ const res = await api.w2vQuery({ text: queryText, top_k: queryTopK });
87
+ setQueryResults(res.results);
88
+ } catch (err) {
89
+ setError(getErrorMessage(err));
90
+ } finally {
91
+ setQueryLoading(false);
92
+ }
93
+ }
94
+
95
+ const ready = initResult !== null;
96
+
97
+ return (
98
+ <div>
99
+ {/* 1. Training */}
100
+ <div className="panel">
101
+ <h2>1. Train Word2Vec (gensim)</h2>
102
+ <p className="panel-desc">
103
+ Static embeddings — one vector per word, no context awareness.
104
+ Useful as a baseline to compare against the transformer approach.
105
+ </p>
106
+ <div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
107
+ <button className="btn btn-secondary" onClick={loadFromEngine}
108
+ disabled={corpusLoading}>
109
+ {corpusLoading ? "Loading..." : "Load from Engine"}
110
+ </button>
111
+ {corpusText && (
112
+ <button className="btn btn-secondary" onClick={() => setCorpusText("")}>
113
+ Clear
114
+ </button>
115
+ )}
116
+ </div>
117
+ <div className="form-group" style={{ marginBottom: 12 }}>
118
+ <label>
119
+ Corpus (separate documents with blank lines)
120
+ {corpusText && (
121
+ <span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
122
+ {" "} — {parseCorpus().length} documents detected
123
+ </span>
124
+ )}
125
+ </label>
126
+ <textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
127
+ placeholder="Document 1 text...\n\nDocument 2 text..." />
128
+ </div>
129
+ <button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
130
+ {showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
131
+ </button>
132
+
133
+ {showAdvanced && (
134
+ <div className="advanced-section">
135
+ <div className="form-row">
136
+ <div className="form-group" style={{ maxWidth: 120 }}>
137
+ <label>Vector Size</label>
138
+ <input type="number" value={vectorSize} onChange={e => setVectorSize(+e.target.value)} min={50} max={300} />
139
+ </div>
140
+ <div className="form-group" style={{ maxWidth: 120 }}>
141
+ <label>Window</label>
142
+ <input type="number" value={windowSize} onChange={e => setWindowSize(+e.target.value)} min={2} max={15} />
143
+ </div>
144
+ <div className="form-group" style={{ maxWidth: 120 }}>
145
+ <label>Epochs</label>
146
+ <input type="number" value={w2vEpochs} onChange={e => setW2vEpochs(+e.target.value)} min={5} max={200} />
147
+ </div>
148
+ </div>
149
+ </div>
150
+ )}
151
+
152
+ <button className="btn btn-primary" onClick={handleInit}
153
+ disabled={initLoading || !corpusText.trim()} style={{ marginTop: 8 }}>
154
+ {initLoading ? <><span className="spinner" /> Training...</> : "Train Word2Vec"}
155
+ </button>
156
+
157
+ <LogViewer active={initLoading} />
158
+ </div>
159
+
160
+ {error && <StatusMessage type="err" message={error} />}
161
+
162
+ {initResult && (
163
+ <div className="panel">
164
+ <h2>Word2Vec Ready</h2>
165
+ <div className="metric-grid">
166
+ <MetricCard value={initResult.vocab_size} label="Vocabulary" />
167
+ <MetricCard value={initResult.sentences} label="Sentences" />
168
+ <MetricCard value={initResult.vector_size} label="Dimensions" />
169
+ <MetricCard value={`${initResult.seconds}s`} label="Time" />
170
+ </div>
171
+ </div>
172
+ )}
173
+
174
+ {/* 2. Similar Words */}
175
+ <div className="panel">
176
+ <h2>2. Similar Words</h2>
177
+ <p className="panel-desc">
178
+ Find words that appear in similar contexts using Word2Vec static embeddings.
179
+ </p>
180
+ <div className="form-row">
181
+ <div className="form-group">
182
+ <label>Word</label>
183
+ <input value={simWord} onChange={e => setSimWord(e.target.value)}
184
+ onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
185
+ placeholder="e.g. pizza" />
186
+ </div>
187
+ <div className="form-group form-group-sm">
188
+ <label>Top K</label>
189
+ <input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
190
+ </div>
191
+ <div className="form-group form-group-sm">
192
+ <label>&nbsp;</label>
193
+ <button className="btn btn-primary" onClick={handleSimilarWords}
194
+ disabled={simLoading || !ready || !simWord.trim()}>
195
+ {simLoading ? "Searching..." : "Find"}
196
+ </button>
197
+ </div>
198
+ </div>
199
+
200
+ {simResults.length > 0 && (
201
+ <table className="data-table" style={{ marginTop: 12 }}>
202
+ <thead>
203
+ <tr><th>Word</th><th>Similarity</th></tr>
204
+ </thead>
205
+ <tbody>
206
+ {simResults.map((r, i) => (
207
+ <tr key={i}>
208
+ <td style={{ fontWeight: 600 }}>{r.word}</td>
209
+ <td><ScoreBar score={r.score} /></td>
210
+ </tr>
211
+ ))}
212
+ </tbody>
213
+ </table>
214
+ )}
215
+ </div>
216
+
217
+ {/* 3. Compare Texts */}
218
+ <div className="panel">
219
+ <h2>3. Compare Texts</h2>
220
+ <p className="panel-desc">
221
+ Sentence similarity via averaged word vectors.
222
+ </p>
223
+ <div className="form-row">
224
+ <div className="form-group">
225
+ <label>Text A</label>
226
+ <input value={compTextA} onChange={e => setCompTextA(e.target.value)}
227
+ placeholder="pizza gives me homework" />
228
+ </div>
229
+ <div className="form-group">
230
+ <label>Text B</label>
231
+ <input value={compTextB} onChange={e => setCompTextB(e.target.value)}
232
+ placeholder="school gives me homework" />
233
+ </div>
234
+ </div>
235
+ <button className="btn btn-primary" onClick={handleCompare}
236
+ disabled={compLoading || !ready || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
237
+ {compLoading ? "Comparing..." : "Compare"}
238
+ </button>
239
+
240
+ {compResult && (
241
+ <div className="similarity-gauge" style={{ marginTop: 16 }}>
242
+ <div className="similarity-value"
243
+ style={{ color: scoreColor(compResult.similarity) }}>
244
+ {compResult.similarity.toFixed(4)}
245
+ </div>
246
+ <div className="similarity-label">Word2Vec Cosine Similarity</div>
247
+ </div>
248
+ )}
249
+ </div>
250
+
251
+ {/* 4. Semantic Search */}
252
+ <div className="panel">
253
+ <h2>4. Semantic Search</h2>
254
+ <p className="panel-desc">
255
+ Search your corpus using averaged Word2Vec vectors.
256
+ </p>
257
+ <div className="form-row">
258
+ <div className="form-group">
259
+ <label>Query</label>
260
+ <input value={queryText} onChange={e => setQueryText(e.target.value)}
261
+ onKeyDown={e => e.key === "Enter" && handleQuery()}
262
+ placeholder="a place where children learn" />
263
+ </div>
264
+ <div className="form-group form-group-sm">
265
+ <label>Top K</label>
266
+ <input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
267
+ </div>
268
+ <div className="form-group form-group-sm">
269
+ <label>&nbsp;</label>
270
+ <button className="btn btn-primary" onClick={handleQuery}
271
+ disabled={queryLoading || !ready || !queryText.trim()}>
272
+ {queryLoading ? "Searching..." : "Search"}
273
+ </button>
274
+ </div>
275
+ </div>
276
+
277
+ {queryResults.length > 0 && (
278
+ <div style={{ marginTop: 12 }}>
279
+ {queryResults.map((r, i) => (
280
+ <div key={i} className="result-card">
281
+ <div className="result-header">
282
+ <span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
283
+ <ScoreBar score={r.score} />
284
+ </div>
285
+ <div className="result-text">{r.text}</div>
286
+ </div>
287
+ ))}
288
+ </div>
289
+ )}
290
+ </div>
291
+ </div>
292
+ );
293
+ }
frontend/src/hooks/useApiCall.ts ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useCallback } from "react";
2
+ import { getErrorMessage } from "../api";
3
+
4
+ /**
5
+ * Generic hook for API calls with loading/error/result state.
6
+ * Eliminates the repeated try/catch/setLoading/setError pattern.
7
+ */
8
+ export function useApiCall<T>() {
9
+ const [data, setData] = useState<T | null>(null);
10
+ const [loading, setLoading] = useState(false);
11
+ const [error, setError] = useState("");
12
+
13
+ const run = useCallback(async (fn: () => Promise<T>): Promise<T | null> => {
14
+ setLoading(true);
15
+ setError("");
16
+ try {
17
+ const result = await fn();
18
+ setData(result);
19
+ return result;
20
+ } catch (err) {
21
+ setError(getErrorMessage(err));
22
+ return null;
23
+ } finally {
24
+ setLoading(false);
25
+ }
26
+ }, []);
27
+
28
+ const clear = useCallback(() => {
29
+ setData(null);
30
+ setError("");
31
+ }, []);
32
+
33
+ return { data, loading, error, setError, run, clear };
34
+ }
frontend/src/hooks/useCorpusLoader.ts ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import { api, getErrorMessage } from "../api";
3
+
4
+ /**
5
+ * Shared hook for loading corpus text from the engine and parsing it into documents.
6
+ * Used by both TrainingPanel and Word2VecPanel.
7
+ */
8
+ export function useCorpusLoader() {
9
+ const [corpusText, setCorpusText] = useState("");
10
+ const [loading, setLoading] = useState(false);
11
+ const [error, setError] = useState("");
12
+
13
+ function parseCorpus(): string[] {
14
+ return corpusText
15
+ .split(/\n{2,}/)
16
+ .map((t) => t.trim())
17
+ .filter((t) => t.length > 20);
18
+ }
19
+
20
+ async function loadFromEngine() {
21
+ setLoading(true);
22
+ setError("");
23
+ try {
24
+ const res = await api.getCorpusTexts();
25
+ if (res.documents.length === 0) {
26
+ setError("No documents loaded in the engine. Load a dataset first.");
27
+ return;
28
+ }
29
+ setCorpusText(
30
+ res.documents.map((d: { doc_id: string; text: string }) => d.text).join("\n\n")
31
+ );
32
+ } catch (e) {
33
+ setError(getErrorMessage(e));
34
+ } finally {
35
+ setLoading(false);
36
+ }
37
+ }
38
+
39
+ return {
40
+ corpusText,
41
+ setCorpusText,
42
+ loading,
43
+ error,
44
+ setError,
45
+ parseCorpus,
46
+ loadFromEngine,
47
+ };
48
+ }
frontend/src/main.tsx ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
// Application entry point: mount the React tree at #root in StrictMode.
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import App from "./App";

// The non-null assertion is safe only if the host page always ships a #root
// element — presumably index.html does; verify if the HTML template changes.
createRoot(document.getElementById("root")!).render(
  <StrictMode>
    <App />
  </StrictMode>
);
frontend/src/styles.css ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ---- Reset & Base ---- */
2
+ *,
3
+ *::before,
4
+ *::after {
5
+ box-sizing: border-box;
6
+ margin: 0;
7
+ padding: 0;
8
+ }
9
+
10
+ :root {
11
+ --bg: #0f1117;
12
+ --surface: #1a1d27;
13
+ --surface2: #232733;
14
+ --border: #2e3340;
15
+ --text: #e1e4eb;
16
+ --text-dim: #8b90a0;
17
+ --accent: #6c8cff;
18
+ --accent-dim: #4a64cc;
19
+ --ok: #4ade80;
20
+ --warn: #facc15;
21
+ --err: #f87171;
22
+ --radius: 8px;
23
+ }
24
+
25
+ body {
26
+ font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
27
+ background: var(--bg);
28
+ color: var(--text);
29
+ line-height: 1.6;
30
+ }
31
+
32
+ /* ---- App Layout ---- */
33
+ .app {
34
+ max-width: 1200px;
35
+ margin: 0 auto;
36
+ padding: 24px;
37
+ }
38
+
39
+ .app-header {
40
+ display: flex;
41
+ justify-content: space-between;
42
+ align-items: center;
43
+ flex-wrap: wrap;
44
+ gap: 12px;
45
+ margin-bottom: 24px;
46
+ }
47
+
48
+ .app-header h1 {
49
+ font-size: 1.5rem;
50
+ font-weight: 700;
51
+ color: var(--accent);
52
+ }
53
+
54
+ .header-stats {
55
+ display: flex;
56
+ gap: 8px;
57
+ flex-wrap: wrap;
58
+ }
59
+
60
+ /* ---- Badges ---- */
61
+ .badge {
62
+ padding: 4px 10px;
63
+ border-radius: 12px;
64
+ font-size: 0.75rem;
65
+ font-weight: 600;
66
+ background: var(--surface2);
67
+ color: var(--text-dim);
68
+ }
69
+ .badge-ok {
70
+ background: #1a3a2a;
71
+ color: var(--ok);
72
+ }
73
+ .badge-warn {
74
+ background: #3a3520;
75
+ color: var(--warn);
76
+ }
77
+
78
+ /* ---- Progress Stepper ---- */
79
+ .stepper {
80
+ display: flex;
81
+ align-items: flex-start;
82
+ justify-content: center;
83
+ margin-bottom: 28px;
84
+ padding: 0 24px;
85
+ }
86
+
87
+ .stepper-item {
88
+ display: flex;
89
+ flex-direction: column;
90
+ align-items: center;
91
+ gap: 6px;
92
+ position: relative;
93
+ z-index: 1;
94
+ }
95
+
96
+ .stepper-line {
97
+ flex: 1;
98
+ height: 2px;
99
+ background: var(--border);
100
+ margin-top: 16px;
101
+ min-width: 40px;
102
+ }
103
+
104
+ .stepper-line-active {
105
+ background: var(--accent-dim);
106
+ }
107
+
108
+ .stepper-circle {
109
+ width: 34px;
110
+ height: 34px;
111
+ border-radius: 50%;
112
+ border: 2px solid var(--border);
113
+ background: var(--surface);
114
+ color: var(--text-dim);
115
+ font-weight: 700;
116
+ font-size: 0.85rem;
117
+ cursor: pointer;
118
+ display: flex;
119
+ align-items: center;
120
+ justify-content: center;
121
+ transition: all 0.2s;
122
+ }
123
+
124
+ .stepper-circle:hover:not(:disabled) {
125
+ border-color: var(--accent);
126
+ color: var(--accent);
127
+ }
128
+
129
+ .stepper-circle.stepper-active {
130
+ border-color: var(--accent);
131
+ background: var(--accent);
132
+ color: #fff;
133
+ }
134
+
135
+ .stepper-circle.stepper-done {
136
+ border-color: var(--ok);
137
+ background: #1a3a2a;
138
+ color: var(--ok);
139
+ }
140
+
141
+ .stepper-circle:disabled {
142
+ opacity: 0.35;
143
+ cursor: not-allowed;
144
+ }
145
+
146
+ .stepper-label {
147
+ font-size: 0.75rem;
148
+ color: var(--text-dim);
149
+ white-space: nowrap;
150
+ font-weight: 500;
151
+ }
152
+
153
+ .stepper-label-active {
154
+ color: var(--accent);
155
+ font-weight: 600;
156
+ }
157
+
158
+ /* ---- Sub-tabs ---- */
159
+ .subtabs {
160
+ display: flex;
161
+ gap: 2px;
162
+ background: var(--surface);
163
+ border: 1px solid var(--border);
164
+ border-radius: var(--radius);
165
+ padding: 3px;
166
+ margin-bottom: 20px;
167
+ overflow-x: auto;
168
+ }
169
+
170
+ .subtab {
171
+ padding: 7px 16px;
172
+ background: none;
173
+ border: none;
174
+ border-radius: 6px;
175
+ color: var(--text-dim);
176
+ cursor: pointer;
177
+ font-size: 0.8rem;
178
+ font-weight: 500;
179
+ white-space: nowrap;
180
+ transition: all 0.15s;
181
+ }
182
+
183
+ .subtab:hover {
184
+ color: var(--text);
185
+ background: var(--surface2);
186
+ }
187
+
188
+ .subtab-active {
189
+ color: #fff;
190
+ background: var(--accent);
191
+ font-weight: 600;
192
+ }
193
+
194
+ /* ---- Collapsible Toggle ---- */
195
+ .collapsible-toggle {
196
+ display: flex;
197
+ align-items: center;
198
+ gap: 8px;
199
+ width: 100%;
200
+ padding: 14px 16px;
201
+ margin: 16px 0;
202
+ background: var(--surface);
203
+ border: 1px solid var(--border);
204
+ border-radius: var(--radius);
205
+ color: var(--text-dim);
206
+ font-size: 0.85rem;
207
+ font-weight: 500;
208
+ cursor: pointer;
209
+ transition: color 0.15s, border-color 0.15s;
210
+ }
211
+
212
+ .collapsible-toggle:hover {
213
+ color: var(--text);
214
+ border-color: var(--accent-dim);
215
+ }
216
+
217
+ .collapsible-arrow {
218
+ font-size: 0.75rem;
219
+ }
220
+
221
+ /* ---- Advanced Settings Toggle ---- */
222
+ .advanced-toggle {
223
+ display: flex;
224
+ align-items: center;
225
+ gap: 6px;
226
+ padding: 0;
227
+ margin: 12px 0 0;
228
+ background: none;
229
+ border: none;
230
+ color: var(--text-dim);
231
+ font-size: 0.8rem;
232
+ font-weight: 500;
233
+ cursor: pointer;
234
+ transition: color 0.15s;
235
+ }
236
+
237
+ .advanced-toggle:hover {
238
+ color: var(--accent);
239
+ }
240
+
241
+ .advanced-section {
242
+ padding-top: 8px;
243
+ margin-bottom: 12px;
244
+ }
245
+
246
+ /* ---- Toggle (segmented control) ---- */
247
+ .toggle {
248
+ display: inline-flex;
249
+ gap: 2px;
250
+ background: var(--bg);
251
+ border: 1px solid var(--border);
252
+ border-radius: var(--radius);
253
+ padding: 3px;
254
+ }
255
+
256
+ .toggle-option {
257
+ padding: 6px 14px;
258
+ background: none;
259
+ border: none;
260
+ border-radius: 6px;
261
+ color: var(--text-dim);
262
+ font-size: 0.8rem;
263
+ font-weight: 500;
264
+ cursor: pointer;
265
+ transition: all 0.15s;
266
+ white-space: nowrap;
267
+ }
268
+
269
+ .toggle-option:hover {
270
+ color: var(--text);
271
+ }
272
+
273
+ .toggle-option-active {
274
+ background: var(--accent);
275
+ color: #fff;
276
+ font-weight: 600;
277
+ }
278
+
279
+ /* ---- Switch (on/off) ---- */
280
+ .switch {
281
+ display: inline-flex;
282
+ align-items: center;
283
+ gap: 8px;
284
+ cursor: pointer;
285
+ }
286
+
287
+ .switch-track {
288
+ position: relative;
289
+ width: 40px;
290
+ height: 22px;
291
+ border-radius: 11px;
292
+ background: var(--border);
293
+ border: none;
294
+ cursor: pointer;
295
+ padding: 0;
296
+ transition: background 0.2s;
297
+ }
298
+
299
+ .switch-track-on {
300
+ background: var(--accent);
301
+ }
302
+
303
+ .switch-thumb {
304
+ position: absolute;
305
+ top: 2px;
306
+ left: 2px;
307
+ width: 18px;
308
+ height: 18px;
309
+ border-radius: 50%;
310
+ background: #fff;
311
+ transition: transform 0.2s;
312
+ }
313
+
314
+ .switch-track-on .switch-thumb {
315
+ transform: translateX(18px);
316
+ }
317
+
318
+ .switch-label {
319
+ font-size: 0.8rem;
320
+ color: var(--text-dim);
321
+ font-weight: 500;
322
+ user-select: none;
323
+ }
324
+
325
+ /* ---- Custom Select ---- */
326
+ .custom-select {
327
+ position: relative;
328
+ min-width: 180px;
329
+ }
330
+
331
+ .custom-select-trigger {
332
+ display: flex;
333
+ align-items: center;
334
+ justify-content: space-between;
335
+ width: 100%;
336
+ padding: 8px 12px;
337
+ background: var(--bg);
338
+ border: 1px solid var(--border);
339
+ border-radius: var(--radius);
340
+ color: var(--text);
341
+ font-size: 0.875rem;
342
+ font-family: inherit;
343
+ cursor: pointer;
344
+ transition: border-color 0.15s;
345
+ text-align: left;
346
+ }
347
+
348
+ .custom-select-trigger:hover,
349
+ .custom-select-trigger:focus {
350
+ border-color: var(--accent);
351
+ outline: none;
352
+ }
353
+
354
+ .custom-select-arrow {
355
+ font-size: 0.7rem;
356
+ color: var(--text-dim);
357
+ margin-left: 8px;
358
+ }
359
+
360
+ .custom-select-dropdown {
361
+ position: absolute;
362
+ top: calc(100% + 4px);
363
+ left: 0;
364
+ right: 0;
365
+ background: var(--surface);
366
+ border: 1px solid var(--border);
367
+ border-radius: var(--radius);
368
+ padding: 4px;
369
+ z-index: 100;
370
+ max-height: 240px;
371
+ overflow-y: auto;
372
+ box-shadow: 0 8px 24px rgba(0, 0, 0, 0.4);
373
+ }
374
+
375
+ .custom-select-option {
376
+ display: block;
377
+ width: 100%;
378
+ padding: 8px 10px;
379
+ background: none;
380
+ border: none;
381
+ border-radius: 6px;
382
+ color: var(--text-dim);
383
+ font-size: 0.85rem;
384
+ font-family: inherit;
385
+ cursor: pointer;
386
+ text-align: left;
387
+ transition: all 0.1s;
388
+ }
389
+
390
+ .custom-select-option:hover {
391
+ background: var(--surface2);
392
+ color: var(--text);
393
+ }
394
+
395
+ .custom-select-option-active {
396
+ background: var(--accent);
397
+ color: #fff;
398
+ }
399
+
400
+ .custom-select-option-active:hover {
401
+ background: var(--accent-dim);
402
+ color: #fff;
403
+ }
404
+
405
+ /* ---- Server Error Banner ---- */
406
+ .server-error-banner {
407
+ background: #3a1a1a;
408
+ color: var(--err);
409
+ border: 1px solid #5a2a2a;
410
+ border-radius: var(--radius);
411
+ padding: 12px 16px;
412
+ margin-bottom: 20px;
413
+ font-size: 0.85rem;
414
+ line-height: 1.5;
415
+ }
416
+
417
+ /* ---- Content ---- */
418
+ .content {
419
+ min-height: 400px;
420
+ }
421
+
422
+ /* ---- Cards / Panels ---- */
423
+ .panel {
424
+ background: var(--surface);
425
+ border: 1px solid var(--border);
426
+ border-radius: var(--radius);
427
+ padding: 20px;
428
+ margin-bottom: 16px;
429
+ }
430
+
431
+ .panel h2 {
432
+ font-size: 1.1rem;
433
+ font-weight: 600;
434
+ margin-bottom: 12px;
435
+ }
436
+
437
+ .panel h3 {
438
+ font-size: 0.95rem;
439
+ font-weight: 600;
440
+ margin-bottom: 8px;
441
+ color: var(--text-dim);
442
+ }
443
+
444
+ /* ---- Forms ---- */
445
+ .form-row {
446
+ display: flex;
447
+ gap: 12px;
448
+ margin-bottom: 12px;
449
+ flex-wrap: wrap;
450
+ align-items: flex-end;
451
+ }
452
+
453
+ .form-group {
454
+ display: flex;
455
+ flex-direction: column;
456
+ gap: 4px;
457
+ flex: 1;
458
+ min-width: 180px;
459
+ }
460
+
461
+ .form-group label {
462
+ font-size: 0.8rem;
463
+ font-weight: 500;
464
+ color: var(--text-dim);
465
+ }
466
+
467
+ input,
468
+ textarea,
469
+ select {
470
+ padding: 8px 12px;
471
+ background: var(--bg);
472
+ border: 1px solid var(--border);
473
+ border-radius: var(--radius);
474
+ color: var(--text);
475
+ font-size: 0.875rem;
476
+ font-family: inherit;
477
+ }
478
+
479
+ input:focus,
480
+ textarea:focus,
481
+ select:focus {
482
+ outline: none;
483
+ border-color: var(--accent);
484
+ }
485
+
486
+ textarea {
487
+ resize: vertical;
488
+ min-height: 100px;
489
+ }
490
+
491
+ /* ---- Buttons ---- */
492
+ button.btn {
493
+ padding: 8px 20px;
494
+ border: none;
495
+ border-radius: var(--radius);
496
+ font-size: 0.875rem;
497
+ font-weight: 600;
498
+ cursor: pointer;
499
+ transition: background 0.15s, opacity 0.15s;
500
+ }
501
+
502
+ .btn-primary {
503
+ background: var(--accent);
504
+ color: #fff;
505
+ }
506
+ .btn-primary:hover:not(:disabled) {
507
+ background: var(--accent-dim);
508
+ }
509
+ .btn-secondary {
510
+ background: var(--surface2);
511
+ color: var(--text);
512
+ }
513
+ .btn-secondary:hover:not(:disabled) {
514
+ background: var(--border);
515
+ }
516
+
517
+ button:disabled {
518
+ opacity: 0.5;
519
+ cursor: not-allowed;
520
+ }
521
+
522
+ /* ---- Results ---- */
523
+ .result-card {
524
+ background: var(--surface2);
525
+ border: 1px solid var(--border);
526
+ border-radius: var(--radius);
527
+ padding: 16px;
528
+ margin-bottom: 10px;
529
+ transition: border-color 0.15s;
530
+ }
531
+
532
+ .result-card:hover {
533
+ border-color: var(--accent-dim);
534
+ }
535
+
536
+ .result-card .result-header {
537
+ display: flex;
538
+ justify-content: space-between;
539
+ align-items: center;
540
+ margin-bottom: 8px;
541
+ gap: 8px;
542
+ }
543
+
544
+ .result-card-selected {
545
+ border-color: var(--accent);
546
+ }
547
+
548
+ .result-card .result-text {
549
+ font-size: 0.85rem;
550
+ color: var(--text-dim);
551
+ line-height: 1.6;
552
+ }
553
+
554
+ .score-bar-container {
555
+ display: flex;
556
+ align-items: center;
557
+ gap: 8px;
558
+ }
559
+
560
+ .score-bar {
561
+ width: 120px;
562
+ height: 6px;
563
+ background: var(--bg);
564
+ border-radius: 3px;
565
+ overflow: hidden;
566
+ }
567
+
568
+ .score-bar-fill {
569
+ height: 100%;
570
+ border-radius: 3px;
571
+ transition: width 0.3s;
572
+ }
573
+
574
+ .score-label {
575
+ font-size: 0.8rem;
576
+ font-weight: 700;
577
+ font-variant-numeric: tabular-nums;
578
+ min-width: 48px;
579
+ text-align: right;
580
+ }
581
+
582
+ /* ---- Similarity gauge ---- */
583
+ .similarity-gauge {
584
+ display: flex;
585
+ align-items: center;
586
+ justify-content: center;
587
+ flex-direction: column;
588
+ padding: 24px;
589
+ }
590
+
591
+ .similarity-value {
592
+ font-size: 3rem;
593
+ font-weight: 800;
594
+ font-variant-numeric: tabular-nums;
595
+ }
596
+
597
+ .similarity-label {
598
+ font-size: 0.9rem;
599
+ color: var(--text-dim);
600
+ margin-top: 4px;
601
+ }
602
+
603
+ /* ---- Status / Alerts ---- */
604
+ .status {
605
+ padding: 10px 14px;
606
+ border-radius: var(--radius);
607
+ font-size: 0.85rem;
608
+ margin-bottom: 12px;
609
+ }
610
+ .status-ok {
611
+ background: #1a3a2a;
612
+ color: var(--ok);
613
+ }
614
+ .status-err {
615
+ background: #3a1a1a;
616
+ color: var(--err);
617
+ }
618
+ .status-loading {
619
+ background: var(--surface2);
620
+ color: var(--text-dim);
621
+ }
622
+
623
+ /* ---- Table ---- */
624
+ .data-table {
625
+ width: 100%;
626
+ border-collapse: collapse;
627
+ font-size: 0.85rem;
628
+ }
629
+ .data-table th,
630
+ .data-table td {
631
+ padding: 8px 12px;
632
+ text-align: left;
633
+ border-bottom: 1px solid var(--border);
634
+ }
635
+ .data-table th {
636
+ color: var(--text-dim);
637
+ font-weight: 600;
638
+ font-size: 0.8rem;
639
+ text-transform: uppercase;
640
+ letter-spacing: 0.5px;
641
+ }
642
+ .data-table tr:hover td {
643
+ background: var(--surface2);
644
+ }
645
+ .data-table input,
646
+ .data-table select {
647
+ font-size: 0.85rem;
648
+ }
649
+
650
+ /* ---- Grid ---- */
651
+ .grid-2 {
652
+ display: grid;
653
+ grid-template-columns: 1fr 1fr;
654
+ gap: 16px;
655
+ }
656
+
657
+ @media (max-width: 768px) {
658
+ .grid-2 {
659
+ grid-template-columns: 1fr;
660
+ }
661
+ }
662
+
663
+ /* ---- Metric Card ---- */
664
+ .metric-card {
665
+ background: var(--surface2);
666
+ border: 1px solid var(--border);
667
+ border-radius: var(--radius);
668
+ padding: 16px;
669
+ text-align: center;
670
+ }
671
+
672
+ .metric-value {
673
+ font-size: 1.75rem;
674
+ font-weight: 700;
675
+ color: var(--text);
676
+ font-variant-numeric: tabular-nums;
677
+ }
678
+
679
+ .metric-label {
680
+ font-size: 0.78rem;
681
+ color: var(--text-dim);
682
+ margin-top: 4px;
683
+ text-transform: uppercase;
684
+ letter-spacing: 0.3px;
685
+ }
686
+
687
+ /* ---- Spinner ---- */
688
+ .spinner {
689
+ display: inline-block;
690
+ width: 16px;
691
+ height: 16px;
692
+ border: 2px solid var(--text-dim);
693
+ border-top-color: var(--accent);
694
+ border-radius: 50%;
695
+ animation: spin 0.6s linear infinite;
696
+ margin-right: 6px;
697
+ vertical-align: middle;
698
+ }
699
+
700
+ @keyframes spin {
701
+ to {
702
+ transform: rotate(360deg);
703
+ }
704
+ }
705
+
706
+ /* ---- Tags ---- */
707
+ .tag {
708
+ display: inline-block;
709
+ padding: 2px 8px;
710
+ border-radius: 4px;
711
+ font-size: 0.75rem;
712
+ font-weight: 600;
713
+ background: var(--surface);
714
+ margin: 2px;
715
+ }
716
+
717
+ .tag-best {
718
+ background: #1a3a2a;
719
+ color: var(--ok);
720
+ }
721
+
722
+ /* ---- Utility classes ---- */
723
+ .panel-desc {
724
+ color: var(--text-dim);
725
+ font-size: 0.85rem;
726
+ margin-bottom: 12px;
727
+ }
728
+
729
+ .section-label {
730
+ font-size: 0.8rem;
731
+ font-weight: 600;
732
+ color: var(--text-dim);
733
+ margin-bottom: 4px;
734
+ }
735
+
736
+ .text-dim { color: var(--text-dim); }
737
+
738
+ .form-group-sm { max-width: 100px; }
739
+ .form-group-md { max-width: 140px; }
740
+ .form-group-lg { max-width: 220px; }
741
+
742
+ .metric-grid {
743
+ display: flex;
744
+ gap: 16px;
745
+ flex-wrap: wrap;
746
+ }
747
+ .metric-grid > * {
748
+ flex: 1 1 100px;
749
+ }
750
+
751
+ .flex-row { display: flex; gap: 8px; }
752
+ .flex-col { display: flex; flex-direction: column; }
753
+ .flex-wrap { flex-wrap: wrap; }
754
+
755
+ .gap-1 { gap: 8px; }
756
+ .gap-2 { gap: 12px; }
757
+ .gap-3 { gap: 16px; }
758
+
759
+ .mt-1 { margin-top: 8px; }
760
+ .mt-2 { margin-top: 12px; }
761
+ .mt-3 { margin-top: 16px; }
762
+ .mb-1 { margin-bottom: 8px; }
763
+ .mb-2 { margin-bottom: 12px; }
764
+ .mb-3 { margin-bottom: 16px; }
765
+
766
+ /* ---- Context Analysis bar chart ---- */
767
+ .context-bar-row {
768
+ display: flex;
769
+ align-items: center;
770
+ gap: 10px;
771
+ margin-bottom: 6px;
772
+ }
773
+
774
+ .context-bar-label {
775
+ width: 90px;
776
+ font-size: 0.82rem;
777
+ font-weight: 600;
778
+ text-align: right;
779
+ color: var(--text);
780
+ flex-shrink: 0;
781
+ }
782
+
783
+ .context-bar-track {
784
+ flex: 1;
785
+ height: 8px;
786
+ background: var(--bg);
787
+ border-radius: 4px;
788
+ overflow: hidden;
789
+ }
790
+
791
+ .context-bar-fill {
792
+ height: 100%;
793
+ background: var(--accent);
794
+ border-radius: 4px;
795
+ transition: width 0.3s;
796
+ }
797
+
798
+ .context-bar-value {
799
+ font-size: 0.75rem;
800
+ color: var(--text-dim);
801
+ width: 40px;
802
+ text-align: right;
803
+ flex-shrink: 0;
804
+ }
805
+
806
+ .context-snippet {
807
+ font-size: 0.8rem;
808
+ color: var(--text-dim);
809
+ line-height: 1.5;
810
+ padding: 8px 10px;
811
+ background: var(--bg);
812
+ border-radius: 6px;
813
+ margin-bottom: 4px;
814
+ }
815
+
816
+ .context-snippet-source {
817
+ font-size: 0.7rem;
818
+ color: var(--accent);
819
+ margin-right: 6px;
820
+ }
821
+
822
+ .score-bar-legend {
823
+ display: flex;
824
+ justify-content: space-between;
825
+ font-size: 0.75rem;
826
+ color: var(--text-dim);
827
+ margin-top: 4px;
828
+ }
frontend/src/types.ts ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // ---- API Request types ----
2
+
3
+ export interface InitRequest {
4
+ model_name: string;
5
+ chunk_size: number;
6
+ chunk_overlap: number;
7
+ batch_size: number;
8
+ }
9
+
10
+ export interface DocumentRequest {
11
+ doc_id: string;
12
+ text: string;
13
+ }
14
+
15
+ export interface QueryRequest {
16
+ text: string;
17
+ top_k: number;
18
+ }
19
+
20
+ export interface CompareRequest {
21
+ text_a: string;
22
+ text_b: string;
23
+ }
24
+
25
+ export interface KeywordAnalysisRequest {
26
+ keyword: string;
27
+ top_k: number;
28
+ cluster_threshold: number;
29
+ }
30
+
31
+ export interface KeywordMatchRequest {
32
+ keyword: string;
33
+ candidate_meanings: string[];
34
+ }
35
+
36
+ export interface BatchAnalysisRequest {
37
+ keywords: string[];
38
+ top_k: number;
39
+ cluster_threshold: number;
40
+ compare_across: boolean;
41
+ }
42
+
43
+ // ---- API Response types ----
44
+
45
+ export interface ChunkPreview {
46
+ index: number;
47
+ text: string;
48
+ }
49
+
50
+ export interface InitResponse {
51
+ status: string;
52
+ model: string;
53
+ load_time_seconds: number;
54
+ }
55
+
56
+ export interface AddDocResponse {
57
+ status: string;
58
+ doc_id: string;
59
+ num_chunks: number;
60
+ chunks_preview: ChunkPreview[];
61
+ }
62
+
63
+ export interface BuildIndexResponse {
64
+ status: string;
65
+ total_chunks: number;
66
+ embedding_dim: number;
67
+ build_time_seconds: number;
68
+ }
69
+
70
+ export interface QueryResultItem {
71
+ rank: number;
72
+ score: number;
73
+ doc_id: string;
74
+ chunk_index: number;
75
+ text: string;
76
+ }
77
+
78
+ export interface QueryResponse {
79
+ query: string;
80
+ results: QueryResultItem[];
81
+ }
82
+
83
+ export interface CompareResponse {
84
+ text_a: string;
85
+ text_b: string;
86
+ similarity: number;
87
+ }
88
+
89
+ export interface ClusterContext {
90
+ doc_id: string;
91
+ chunk_index: number;
92
+ text: string;
93
+ highlight_positions: [number, number][];
94
+ }
95
+
96
+ export interface SimilarPassage {
97
+ rank: number;
98
+ score: number;
99
+ doc_id: string;
100
+ text: string;
101
+ }
102
+
103
+ export interface MeaningCluster {
104
+ cluster_id: number;
105
+ size: number;
106
+ representative_text: string;
107
+ contexts: ClusterContext[];
108
+ similar_passages: SimilarPassage[];
109
+ }
110
+
111
+ export interface KeywordAnalysisResponse {
112
+ keyword: string;
113
+ total_occurrences: number;
114
+ meaning_clusters: MeaningCluster[];
115
+ cross_keyword_similarities: Record<string, number>;
116
+ }
117
+
118
+ export interface MatchResult {
119
+ doc_id: string;
120
+ chunk_index: number;
121
+ text: string;
122
+ best_match: string;
123
+ best_score: number;
124
+ all_scores: Record<string, number>;
125
+ }
126
+
127
+ export interface MatchResponse {
128
+ keyword: string;
129
+ candidate_meanings: string[];
130
+ matches: MatchResult[];
131
+ }
132
+
133
+ export interface CorpusStats {
134
+ total_chunks: number;
135
+ total_documents: number;
136
+ document_ids: string[];
137
+ index_built: boolean;
138
+ embedding_dim: number;
139
+ model_name: string;
140
+ }
141
+
142
+ export interface SimilarityDistribution {
143
+ sample_size: number;
144
+ mean: number;
145
+ std: number;
146
+ min: number;
147
+ max: number;
148
+ percentiles: Record<string, number>;
149
+ histogram: { bin_start: number; bin_end: number; count: number }[];
150
+ }
151
+
152
+ export interface DisambiguationMetric {
153
+ keyword: string;
154
+ accuracy: number;
155
+ weighted_f1: number;
156
+ per_meaning_precision: Record<string, number>;
157
+ per_meaning_recall: Record<string, number>;
158
+ per_meaning_f1: Record<string, number>;
159
+ confusion_matrix: number[][];
160
+ total_samples: number;
161
+ }
162
+
163
+ export interface RetrievalMetric {
164
+ query: string;
165
+ mrr: number;
166
+ precision_at_k: Record<string, number>;
167
+ recall_at_k: Record<string, number>;
168
+ ndcg_at_k: Record<string, number>;
169
+ avg_similarity: number;
170
+ top_score: number;
171
+ }
172
+
173
+ // ---- Training types ----
174
+
175
+ export interface TrainResponse {
176
+ strategy: string;
177
+ model_path: string;
178
+ training_pairs: number;
179
+ epochs: number;
180
+ seconds: number;
181
+ keywords?: string[];
182
+ }
183
+
184
+ export interface TrainEvalResponse {
185
+ pairs: {
186
+ text_a: string;
187
+ text_b: string;
188
+ expected: number;
189
+ base_score: number;
190
+ trained_score: number;
191
+ base_error: number;
192
+ trained_error: number;
193
+ }[];
194
+ summary: {
195
+ avg_base_error: number;
196
+ avg_trained_error: number;
197
+ error_reduction_pct: number;
198
+ improved: number;
199
+ degraded: number;
200
+ total: number;
201
+ };
202
+ }
203
+
204
+ // ---- Word2Vec types ----
205
+
206
+ export interface W2VInitResponse {
207
+ vocab_size: number;
208
+ sentences: number;
209
+ vector_size: number;
210
+ seconds: number;
211
+ }
212
+
213
+ export interface W2VQueryResult {
214
+ rank: number;
215
+ score: number;
216
+ doc_id: string;
217
+ text: string;
218
+ }
219
+
220
+ export interface W2VSimilarWord {
221
+ word: string;
222
+ score: number;
223
+ }
224
+
225
+ // ---- Dataset types ----
226
+
227
+ export interface DatasetSourceInfo {
228
+ dataset_id: string;
229
+ url: string;
230
+ description: string;
231
+ columns?: string[];
232
+ size_mb?: number;
233
+ model?: string;
234
+ vector_dim?: number;
235
+ }
236
+
237
+ export interface DatasetInfo {
238
+ raw_texts: DatasetSourceInfo;
239
+ embeddings: DatasetSourceInfo;
240
+ }
241
+
242
+ export interface DatasetLoadRequest {
243
+ source: "raw" | "embeddings";
244
+ max_docs: number;
245
+ min_text_length: number;
246
+ source_filter?: string;
247
+ build_index: boolean;
248
+ }
249
+
250
+ export interface DatasetLoadResponse {
251
+ documents_loaded?: number;
252
+ documents_skipped?: number;
253
+ documents_created?: number;
254
+ total_chunks?: number;
255
+ chunks_indexed?: number;
256
+ chromadb_vectors?: number;
257
+ index_built: boolean;
258
+ seconds: number;
259
+ source?: string;
260
+ }
261
+
262
+ export interface DatasetPreviewDoc {
263
+ doc_id: string;
264
+ filename: string;
265
+ text_preview: string;
266
+ text_length: number;
267
+ }
268
+
269
+ export interface DatasetPreviewResponse {
270
+ count: number;
271
+ documents: DatasetPreviewDoc[];
272
+ }
273
+
274
+ // ---- Context Analysis types ----
275
+
276
+ export interface ContextAssociatedWord {
277
+ word: string;
278
+ score: number;
279
+ }
280
+
281
+ export interface ContextExample {
282
+ doc_id: string;
283
+ snippet: string;
284
+ }
285
+
286
+ export interface ContextMeaning {
287
+ cluster_id: number;
288
+ occurrences: number;
289
+ confidence: number;
290
+ associated_words: ContextAssociatedWord[];
291
+ example_contexts: ContextExample[];
292
+ }
293
+
294
+ export interface ContextAnalysisResponse {
295
+ keyword: string;
296
+ total_occurrences: number;
297
+ meanings: ContextMeaning[];
298
+ }
299
+
300
+ // ---- UI State ----
301
+
302
+ export type EvalSection = "distribution" | "disambiguation" | "retrieval";
frontend/src/utils/colors.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ /** Map a 0–1 similarity/score to a CSS color variable. */
2
+ export function scoreColor(score: number): string {
3
+ if (score >= 0.7) return "var(--ok)";
4
+ if (score >= 0.4) return "var(--warn)";
5
+ return "var(--err)";
6
+ }
frontend/src/vite-env.d.ts ADDED
@@ -0,0 +1 @@
 
 
1
+ /// <reference types="vite/client" />
frontend/tsconfig.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "useDefineForClassFields": true,
5
+ "lib": ["ES2020", "DOM", "DOM.Iterable"],
6
+ "module": "ESNext",
7
+ "skipLibCheck": true,
8
+ "moduleResolution": "bundler",
9
+ "allowImportingTsExtensions": true,
10
+ "isolatedModules": true,
11
+ "moduleDetection": "force",
12
+ "noEmit": true,
13
+ "jsx": "react-jsx",
14
+ "strict": true,
15
+ "noUnusedLocals": false,
16
+ "noUnusedParameters": false,
17
+ "noFallthroughCasesInSwitch": true,
18
+ "forceConsistentCasingInFileNames": true
19
+ },
20
+ "include": ["src"]
21
+ }
frontend/vite.config.ts ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from "vite";
2
+ import react from "@vitejs/plugin-react";
3
+
4
+ export default defineConfig({
5
+ plugins: [react()],
6
+ server: {
7
+ proxy: {
8
+ "/api/logs/stream": {
9
+ target: "http://localhost:8000",
10
+ headers: { "Accept": "text/event-stream" },
11
+ },
12
+ "/api": "http://localhost:8000",
13
+ },
14
+ },
15
+ });
pyproject.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "esfiles-ndr"
3
+ version = "1.0.0"
4
+ description = "Contextual word similarity analysis using transformer embeddings and Word2Vec baseline"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "sentence-transformers>=5.2.3",
8
+ "faiss-cpu>=1.13.2",
9
+ "torch>=2.10.0",
10
+ "numpy>=2.4.3",
11
+ "scikit-learn>=1.8.0",
12
+ "tqdm>=4.67.3",
13
+ "gensim>=4.4.0",
14
+ "fastapi>=0.135.1",
15
+ "uvicorn[standard]>=0.41.0",
16
+ "python-multipart>=0.0.22",
17
+ "accelerate>=1.13.0",
18
+ "datasets>=4.7.0",
19
+ "chromadb>=1.5.4",
20
+ ]
21
+
22
+ [project.scripts]
23
+ serve = "server:main"
24
+ demo = "demo:main"
25
+
26
+ [tool.uv]
27
+ dev-dependencies = []
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentence-transformers>=5.2.3
2
+ faiss-cpu>=1.13.2
3
+ torch>=2.10.0
4
+ numpy>=2.4.3
5
+ scikit-learn>=1.8.0
6
+ tqdm>=4.67.3
7
+ gensim>=4.4.0
8
+ fastapi>=0.135.1
9
+ uvicorn[standard]>=0.41.0
10
+ python-multipart>=0.0.22
11
+ datasets>=4.7.0
12
+ chromadb>=1.5.4
13
+ accelerate>=1.13.0