Besjon Cifliku commited on
Commit ·
db764ae
1
Parent(s): 9f009c2
feat: initial project setup
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +42 -0
- .gitignore +55 -0
- Dockerfile +59 -0
- HOWTO.md +390 -0
- README.md +193 -2
- contextual_similarity.py +850 -0
- data_loader.py +286 -0
- demo.py +233 -0
- docker-compose.yml +18 -0
- evaluation.py +547 -0
- frontend/.gitignore +24 -0
- frontend/README.md +16 -0
- frontend/eslint.config.js +29 -0
- frontend/index.html +12 -0
- frontend/package-lock.json +0 -0
- frontend/package.json +30 -0
- frontend/public/vite.svg +1 -0
- frontend/src/App.tsx +182 -0
- frontend/src/api.ts +144 -0
- frontend/src/assets/react.svg +1 -0
- frontend/src/components/BatchAnalysis.tsx +110 -0
- frontend/src/components/ContextAnalysis.tsx +116 -0
- frontend/src/components/DatasetPanel.tsx +246 -0
- frontend/src/components/EngineSetup.tsx +172 -0
- frontend/src/components/EvaluationDashboard.tsx +603 -0
- frontend/src/components/KeywordAnalysis.tsx +100 -0
- frontend/src/components/KeywordMatcher.tsx +90 -0
- frontend/src/components/LogViewer.tsx +71 -0
- frontend/src/components/MetricCard.tsx +16 -0
- frontend/src/components/ScoreBar.tsx +19 -0
- frontend/src/components/Select.tsx +60 -0
- frontend/src/components/SemanticSearch.tsx +70 -0
- frontend/src/components/SimilarWords.tsx +75 -0
- frontend/src/components/StatusMessage.tsx +13 -0
- frontend/src/components/Switch.tsx +22 -0
- frontend/src/components/TextCompare.tsx +84 -0
- frontend/src/components/Toggle.tsx +27 -0
- frontend/src/components/TrainingPanel.tsx +349 -0
- frontend/src/components/Word2VecPanel.tsx +293 -0
- frontend/src/hooks/useApiCall.ts +34 -0
- frontend/src/hooks/useCorpusLoader.ts +48 -0
- frontend/src/main.tsx +9 -0
- frontend/src/styles.css +828 -0
- frontend/src/types.ts +302 -0
- frontend/src/utils/colors.ts +6 -0
- frontend/src/vite-env.d.ts +1 -0
- frontend/tsconfig.json +21 -0
- frontend/vite.config.ts +15 -0
- pyproject.toml +27 -0
- requirements.txt +12 -0
.dockerignore
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated data & model artifacts
|
| 2 |
+
engine_state/
|
| 3 |
+
chroma_epstein/
|
| 4 |
+
checkpoints/
|
| 5 |
+
trained_model/
|
| 6 |
+
|
| 7 |
+
# Python
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
.venv/
|
| 11 |
+
venv/
|
| 12 |
+
*.egg-info/
|
| 13 |
+
|
| 14 |
+
# Node (frontend is built inside Docker)
|
| 15 |
+
frontend/node_modules/
|
| 16 |
+
frontend/dist/
|
| 17 |
+
|
| 18 |
+
# Git
|
| 19 |
+
.git/
|
| 20 |
+
.gitattributes
|
| 21 |
+
|
| 22 |
+
# OS & IDE
|
| 23 |
+
.DS_Store
|
| 24 |
+
.vscode/
|
| 25 |
+
.idea/
|
| 26 |
+
|
| 27 |
+
# HuggingFace cache
|
| 28 |
+
.cache/
|
| 29 |
+
|
| 30 |
+
# Docs (not needed in image)
|
| 31 |
+
HOWTO.md
|
| 32 |
+
README.md
|
| 33 |
+
|
| 34 |
+
# Docker (avoid recursive COPY)
|
| 35 |
+
Dockerfile
|
| 36 |
+
docker-compose.yml
|
| 37 |
+
.dockerignore
|
| 38 |
+
|
| 39 |
+
# Env & logs
|
| 40 |
+
.env
|
| 41 |
+
.env.local
|
| 42 |
+
*.log
|
.gitignore
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
*.egg-info/
|
| 7 |
+
*.egg
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.venv/
|
| 11 |
+
venv/
|
| 12 |
+
.Python
|
| 13 |
+
|
| 14 |
+
# Node / Frontend
|
| 15 |
+
frontend/node_modules/
|
| 16 |
+
frontend/dist/
|
| 17 |
+
frontend/dist-ssr/
|
| 18 |
+
npm-debug.log*
|
| 19 |
+
yarn-debug.log*
|
| 20 |
+
pnpm-debug.log*
|
| 21 |
+
|
| 22 |
+
# Generated data & model artifacts
|
| 23 |
+
engine_state/
|
| 24 |
+
chroma_epstein/
|
| 25 |
+
checkpoints/
|
| 26 |
+
trained_model/
|
| 27 |
+
*.faiss
|
| 28 |
+
*.npy
|
| 29 |
+
*.pkl
|
| 30 |
+
*.pickle
|
| 31 |
+
|
| 32 |
+
# HuggingFace cache
|
| 33 |
+
.cache/
|
| 34 |
+
|
| 35 |
+
# OS
|
| 36 |
+
.DS_Store
|
| 37 |
+
Thumbs.db
|
| 38 |
+
|
| 39 |
+
# IDEs
|
| 40 |
+
.vscode/
|
| 41 |
+
.idea/
|
| 42 |
+
*.swp
|
| 43 |
+
*.swo
|
| 44 |
+
*.suo
|
| 45 |
+
*.ntvs*
|
| 46 |
+
*.njsproj
|
| 47 |
+
*.sln
|
| 48 |
+
|
| 49 |
+
# Environment
|
| 50 |
+
.env
|
| 51 |
+
.env.local
|
| 52 |
+
.env.*.local
|
| 53 |
+
|
| 54 |
+
# Logs
|
| 55 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================
|
| 2 |
+
# Multi-stage Docker build for Contextual Similarity Engine
|
| 3 |
+
# Single container: React frontend + FastAPI backend
|
| 4 |
+
# Deploys to: HuggingFace Spaces (Docker SDK), local, Railway
|
| 5 |
+
# =============================================================
|
| 6 |
+
|
| 7 |
+
# Stage 1: Build frontend
|
| 8 |
+
FROM node:22-slim AS frontend-build
|
| 9 |
+
WORKDIR /app/frontend
|
| 10 |
+
COPY frontend/package.json frontend/package-lock.json ./
|
| 11 |
+
RUN npm ci
|
| 12 |
+
COPY frontend/ ./
|
| 13 |
+
RUN npm run build
|
| 14 |
+
|
| 15 |
+
# Stage 2: Python runtime
|
| 16 |
+
FROM python:3.12-slim AS runtime
|
| 17 |
+
|
| 18 |
+
# Create non-root user (required by HF Spaces)
|
| 19 |
+
RUN useradd -m -u 1000 appuser
|
| 20 |
+
WORKDIR /app
|
| 21 |
+
|
| 22 |
+
# System deps for faiss-cpu and torch
|
| 23 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 24 |
+
build-essential \
|
| 25 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 26 |
+
|
| 27 |
+
# Install uv for fast dependency resolution
|
| 28 |
+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
| 29 |
+
|
| 30 |
+
# Copy dependency files first (cache layer)
|
| 31 |
+
COPY --chown=appuser pyproject.toml uv.lock ./
|
| 32 |
+
|
| 33 |
+
# Install Python dependencies
|
| 34 |
+
RUN uv sync --frozen --no-dev
|
| 35 |
+
|
| 36 |
+
# Copy backend source
|
| 37 |
+
COPY --chown=appuser *.py ./
|
| 38 |
+
|
| 39 |
+
# Copy pre-built frontend
|
| 40 |
+
COPY --chown=appuser --from=frontend-build /app/frontend/dist ./frontend/dist
|
| 41 |
+
|
| 42 |
+
# Data directories (HF cache, engine state, trained models)
|
| 43 |
+
RUN mkdir -p /data/huggingface /data/engine_state /data/trained_model \
|
| 44 |
+
&& chown -R appuser:appuser /app /data
|
| 45 |
+
|
| 46 |
+
ENV HF_HOME=/data/huggingface
|
| 47 |
+
ENV TRANSFORMERS_CACHE=/data/huggingface
|
| 48 |
+
ENV ENGINE_STATE_DIR=/data/engine_state
|
| 49 |
+
|
| 50 |
+
# Switch to non-root user
|
| 51 |
+
USER appuser
|
| 52 |
+
|
| 53 |
+
# Expose port (HF Spaces expects 7860, override via PORT env)
|
| 54 |
+
EXPOSE 7860
|
| 55 |
+
|
| 56 |
+
# Run the server — HOST and PORT configurable via env
|
| 57 |
+
ENV HOST=0.0.0.0
|
| 58 |
+
ENV PORT=7860
|
| 59 |
+
CMD ["uv", "run", "python", "server.py"]
|
HOWTO.md
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contextual Similarity Engine — HOWTO
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
This project uses **transformer-based sentence embeddings** to find and compare
|
| 6 |
+
contextual meanings of keywords within large documents. Unlike Word2Vec (static,
|
| 7 |
+
one-vector-per-word), this system **fine-tunes on YOUR corpus** so it learns
|
| 8 |
+
domain-specific patterns — e.g. that "pizza" means "school" in your data.
|
| 9 |
+
|
| 10 |
+
A **Word2Vec (gensim) baseline** is included for comparison, demonstrating why
|
| 11 |
+
contextual embeddings are superior for meaning disambiguation.
|
| 12 |
+
|
| 13 |
+
**The pipeline is: TRAIN → INDEX → ANALYZE → EVALUATE.**
|
| 14 |
+
|
| 15 |
+
**Stack:**
|
| 16 |
+
- **SentenceTransformers** — contextual embeddings (PyTorch)
|
| 17 |
+
- **FAISS** — fast vector similarity search
|
| 18 |
+
- **gensim Word2Vec** — static embedding baseline for comparison
|
| 19 |
+
- **FastAPI** — REST API backend
|
| 20 |
+
- **React + TypeScript** — visualization frontend
|
| 21 |
+
- **scikit-learn** — clustering & evaluation metrics
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## 1. Install Dependencies
|
| 26 |
+
|
| 27 |
+
### Python backend (uv — recommended)
|
| 28 |
+
|
| 29 |
+
[uv](https://docs.astral.sh/uv/) is a fast Python package manager that replaces
|
| 30 |
+
`pip`, `venv`, and `requirements.txt` with a single tool and lockfile.
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
# Install uv (if not already installed)
|
| 34 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 35 |
+
|
| 36 |
+
# Create a virtual environment and install all dependencies from pyproject.toml
|
| 37 |
+
cd esfiles
|
| 38 |
+
uv sync
|
| 39 |
+
|
| 40 |
+
# Run commands inside the managed environment
|
| 41 |
+
uv run python server.py
|
| 42 |
+
uv run python demo.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
`uv sync` reads `pyproject.toml`, resolves dependencies, creates a `.venv`,
|
| 46 |
+
and generates a `uv.lock` lockfile for reproducible installs. The lockfile
|
| 47 |
+
pins exact versions so every machine gets identical dependencies.
|
| 48 |
+
|
| 49 |
+
**Adding/removing packages:**
|
| 50 |
+
|
| 51 |
+
```bash
|
| 52 |
+
uv add httpx # add a new dependency
|
| 53 |
+
uv remove httpx # remove it
|
| 54 |
+
uv lock --upgrade # upgrade all packages to latest compatible versions
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### Python backend (pip — alternative)
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
python3 -m venv venv
|
| 61 |
+
source venv/bin/activate
|
| 62 |
+
pip install -r requirements.txt
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### React frontend
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
cd frontend
|
| 69 |
+
npm install
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## 2. Quick Start
|
| 75 |
+
|
| 76 |
+
### CLI demo (Word2Vec vs Transformer comparison)
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
uv run python demo.py
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
This runs side-by-side comparison:
|
| 83 |
+
1. Builds both Transformer and Word2Vec engines on the same corpus
|
| 84 |
+
2. Compares text similarity scores between approaches
|
| 85 |
+
3. Shows word-level similarity (Word2Vec only — transformers don't do single words)
|
| 86 |
+
4. Runs semantic search with both engines
|
| 87 |
+
5. Tests keyword meaning matching ("pizza" → food or school?)
|
| 88 |
+
6. Demonstrates clustering (transformer can separate meanings, Word2Vec cannot)
|
| 89 |
+
|
| 90 |
+
### Web UI
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
# Terminal 1: start the API server
|
| 94 |
+
uv run python server.py
|
| 95 |
+
|
| 96 |
+
# Terminal 2: start the React dev server
|
| 97 |
+
cd frontend && npm run dev
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
- API docs: `http://localhost:8000/docs`
|
| 101 |
+
- Frontend: `http://localhost:5173`
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## 3. Training Your Model
|
| 106 |
+
|
| 107 |
+
Three strategies, from simplest to most powerful:
|
| 108 |
+
|
| 109 |
+
### Strategy 1: Unsupervised (TSDAE)
|
| 110 |
+
|
| 111 |
+
No labels needed. Learns your corpus vocabulary and phrasing via denoising autoencoder.
|
| 112 |
+
|
| 113 |
+
```python
|
| 114 |
+
from training import CorpusTrainer
|
| 115 |
+
|
| 116 |
+
corpus_texts = [open(f).read() for f in your_files]
|
| 117 |
+
trainer = CorpusTrainer(corpus_texts, base_model="all-MiniLM-L6-v2")
|
| 118 |
+
|
| 119 |
+
result = trainer.train_unsupervised(
|
| 120 |
+
output_path="./trained_model",
|
| 121 |
+
epochs=3,
|
| 122 |
+
batch_size=16,
|
| 123 |
+
)
|
| 124 |
+
print(f"Trained on {result['training_pairs']} sentences in {result['seconds']}s")
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### Strategy 2: Contrastive (auto-mined pairs)
|
| 128 |
+
|
| 129 |
+
Adjacent sentences = similar, random sentences = dissimilar. Learns document structure
|
| 130 |
+
using MultipleNegativesRankingLoss with in-batch negatives.
|
| 131 |
+
|
| 132 |
+
```python
|
| 133 |
+
trainer = CorpusTrainer(corpus_texts)
|
| 134 |
+
|
| 135 |
+
result = trainer.train_contrastive(
|
| 136 |
+
output_path="./trained_model",
|
| 137 |
+
epochs=5,
|
| 138 |
+
batch_size=16,
|
| 139 |
+
)
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### Strategy 3: Keyword-supervised (best if you know the code words)
|
| 143 |
+
|
| 144 |
+
You provide a keyword→meaning map. The trainer auto-generates training pairs:
|
| 145 |
+
keyword-in-context ↔ meaning-substituted version, plus contrastive pairs from
|
| 146 |
+
corpus structure.
|
| 147 |
+
|
| 148 |
+
```python
|
| 149 |
+
trainer = CorpusTrainer(corpus_texts)
|
| 150 |
+
|
| 151 |
+
result = trainer.train_with_keywords(
|
| 152 |
+
keyword_meanings={"pizza": "school", "pepperoni": "math class"},
|
| 153 |
+
output_path="./trained_model",
|
| 154 |
+
epochs=5,
|
| 155 |
+
batch_size=16,
|
| 156 |
+
)
|
| 157 |
+
print(f"Keywords: {result['keywords']}")
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
### Verifying training worked
|
| 161 |
+
|
| 162 |
+
```python
|
| 163 |
+
# Compare base model vs trained model on test pairs
|
| 164 |
+
comparison = trainer.evaluate_model(
|
| 165 |
+
test_pairs=[
|
| 166 |
+
("pizza gives me homework", "school gives me homework", 0.95),
|
| 167 |
+
("pizza gives me homework", "I ate delicious pizza", 0.1),
|
| 168 |
+
("The pizza test is hard", "The school exam is difficult", 0.9),
|
| 169 |
+
],
|
| 170 |
+
trained_model_path="./trained_model",
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
print(f"Base error: {comparison['summary']['avg_base_error']:.4f}")
|
| 174 |
+
print(f"Trained error: {comparison['summary']['avg_trained_error']:.4f}")
|
| 175 |
+
print(f"Reduction: {comparison['summary']['error_reduction_pct']:.1f}%")
|
| 176 |
+
print(f"Improved: {comparison['summary']['improved']}/{comparison['summary']['total']}")
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
---
|
| 180 |
+
|
| 181 |
+
## 4. Using Your Trained Model
|
| 182 |
+
|
| 183 |
+
After training, use the saved model path instead of the pretrained model name:
|
| 184 |
+
|
| 185 |
+
```python
|
| 186 |
+
from contextual_similarity import ContextualSimilarityEngine
|
| 187 |
+
|
| 188 |
+
engine = ContextualSimilarityEngine(model_name="./trained_model")
|
| 189 |
+
|
| 190 |
+
engine.add_document("doc1", open("doc1.txt").read())
|
| 191 |
+
engine.build_index()
|
| 192 |
+
|
| 193 |
+
# Queries now use your domain-trained embeddings
|
| 194 |
+
results = engine.query("pizza homework", top_k=10)
|
| 195 |
+
matches = engine.match_keyword_to_meaning("pizza", [
|
| 196 |
+
"Italian food, restaurant, cooking",
|
| 197 |
+
"School, education, homework and tests",
|
| 198 |
+
])
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
---
|
| 202 |
+
|
| 203 |
+
## 5. Word2Vec Baseline Comparison
|
| 204 |
+
|
| 205 |
+
A gensim Word2Vec engine is included to demonstrate the difference between
|
| 206 |
+
static and contextual embeddings:
|
| 207 |
+
|
| 208 |
+
```python
|
| 209 |
+
from word2vec_baseline import Word2VecEngine
|
| 210 |
+
|
| 211 |
+
w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
|
| 212 |
+
for doc_id, text in docs.items():
|
| 213 |
+
w2v.add_document(doc_id, text)
|
| 214 |
+
w2v.build_index()
|
| 215 |
+
|
| 216 |
+
# Word-level: which words appear in similar contexts?
|
| 217 |
+
w2v.most_similar_words("pizza", top_k=5)
|
| 218 |
+
|
| 219 |
+
# Sentence-level: averaged word vectors (lossy)
|
| 220 |
+
w2v.compare_texts("pizza gives me homework", "school gives me homework")
|
| 221 |
+
|
| 222 |
+
# Search
|
| 223 |
+
w2v.query("a place where children learn", top_k=3)
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
**Key limitation:** Word2Vec gives ONE vector per word. "pizza" always has the
|
| 227 |
+
same embedding whether it means food or school. Transformers encode the full
|
| 228 |
+
surrounding context, so the same word gets different embeddings in different passages.
|
| 229 |
+
|
| 230 |
+
---
|
| 231 |
+
|
| 232 |
+
## 6. Using the Web UI
|
| 233 |
+
|
| 234 |
+
1. **Train Model** (start here):
|
| 235 |
+
- Paste your corpus (documents separated by blank lines)
|
| 236 |
+
- Choose strategy: Unsupervised, Contrastive, or Keyword-supervised
|
| 237 |
+
- For keyword strategy, provide a JSON keyword→meaning map
|
| 238 |
+
- Configure base model, epochs, batch size, output path
|
| 239 |
+
- Click "Start Training" — model trains and saves to disk
|
| 240 |
+
- Run "Compare Models" to evaluate base vs trained
|
| 241 |
+
|
| 242 |
+
2. **Setup:**
|
| 243 |
+
- Initialize engine with your trained model path (e.g. `./trained_model`)
|
| 244 |
+
- Add documents and build the FAISS index
|
| 245 |
+
|
| 246 |
+
3. **Semantic Search:** query the corpus with trained embeddings
|
| 247 |
+
4. **Compare Texts:** cosine similarity between any two texts
|
| 248 |
+
5. **Keyword Analysis:** auto-cluster keyword meanings across documents
|
| 249 |
+
6. **Keyword Matcher:** match keyword occurrences to candidate meanings
|
| 250 |
+
7. **Batch Analysis:** multi-keyword analysis with cross-similarity matrix
|
| 251 |
+
8. **Evaluation:** disambiguation accuracy, retrieval P@K/MRR, similarity histograms
|
| 252 |
+
|
| 253 |
+
---
|
| 254 |
+
|
| 255 |
+
## 7. API Endpoints
|
| 256 |
+
|
| 257 |
+
### Training
|
| 258 |
+
| Method | Endpoint | Description |
|
| 259 |
+
|--------|----------|-------------|
|
| 260 |
+
| POST | `/api/train/unsupervised` | TSDAE domain adaptation |
|
| 261 |
+
| POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
|
| 262 |
+
| POST | `/api/train/keywords` | Keyword-supervised training |
|
| 263 |
+
| POST | `/api/train/evaluate` | Compare base vs trained model |
|
| 264 |
+
|
| 265 |
+
### Engine
|
| 266 |
+
| Method | Endpoint | Description |
|
| 267 |
+
|--------|----------|-------------|
|
| 268 |
+
| POST | `/api/init` | Initialize engine with a model |
|
| 269 |
+
| POST | `/api/documents` | Add a document to the corpus |
|
| 270 |
+
| POST | `/api/documents/upload` | Upload a file as a document |
|
| 271 |
+
| POST | `/api/index/build` | Build FAISS index |
|
| 272 |
+
| POST | `/api/query` | Semantic search |
|
| 273 |
+
| POST | `/api/compare` | Compare two texts |
|
| 274 |
+
| POST | `/api/analyze/keyword` | Single keyword analysis |
|
| 275 |
+
| POST | `/api/analyze/batch` | Multi-keyword batch analysis |
|
| 276 |
+
| POST | `/api/match` | Match keyword to candidate meanings |
|
| 277 |
+
| GET | `/api/stats` | Corpus statistics |
|
| 278 |
+
|
| 279 |
+
### Evaluation
|
| 280 |
+
| Method | Endpoint | Description |
|
| 281 |
+
|--------|----------|-------------|
|
| 282 |
+
| POST | `/api/eval/disambiguation` | Disambiguation accuracy |
|
| 283 |
+
| POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
|
| 284 |
+
| GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
|
| 285 |
+
|
| 286 |
+
### Word2Vec Baseline
|
| 287 |
+
| Method | Endpoint | Description |
|
| 288 |
+
|--------|----------|-------------|
|
| 289 |
+
| POST | `/api/w2v/init` | Train Word2Vec on corpus |
|
| 290 |
+
| POST | `/api/w2v/compare` | Compare two texts (averaged word vectors) |
|
| 291 |
+
| POST | `/api/w2v/query` | Search corpus |
|
| 292 |
+
| POST | `/api/w2v/similar-words` | Find similar words |
|
| 293 |
+
|
| 294 |
+
---
|
| 295 |
+
|
| 296 |
+
## 8. Available Base Models
|
| 297 |
+
|
| 298 |
+
| Model | Dim | Size | Quality | Speed |
|
| 299 |
+
|-------|-----|------|---------|-------|
|
| 300 |
+
| `all-MiniLM-L6-v2` | 384 | ~80MB | Good | Fast |
|
| 301 |
+
| `all-mpnet-base-v2` | 768 | ~420MB | Best | Medium |
|
| 302 |
+
|
| 303 |
+
Start with `all-MiniLM-L6-v2` for fast iteration, upgrade to `all-mpnet-base-v2`
|
| 304 |
+
for production quality.
|
| 305 |
+
|
| 306 |
+
---
|
| 307 |
+
|
| 308 |
+
## 9. Evaluation Metrics
|
| 309 |
+
|
| 310 |
+
| Metric | What it measures |
|
| 311 |
+
|--------|-----------------|
|
| 312 |
+
| **Accuracy** | % of keyword occurrences correctly matched to their meaning |
|
| 313 |
+
| **Weighted F1** | Harmonic mean of precision/recall, weighted by class frequency |
|
| 314 |
+
| **MRR** | Mean Reciprocal Rank — how early the first relevant result appears |
|
| 315 |
+
| **P@K** | Precision at K — fraction of top-K results that are relevant |
|
| 316 |
+
| **NDCG@K** | Normalized Discounted Cumulative Gain — ranking quality metric |
|
| 317 |
+
|
| 318 |
+
---
|
| 319 |
+
|
| 320 |
+
## 10. Tuning Parameters
|
| 321 |
+
|
| 322 |
+
### Training
|
| 323 |
+
|
| 324 |
+
| Parameter | Default | Notes |
|
| 325 |
+
|-----------|---------|-------|
|
| 326 |
+
| `epochs` | 3-5 | More = better fit but risk overfitting |
|
| 327 |
+
| `batch_size` | 16 | Larger = faster, needs more memory. MNRL benefits from larger batches |
|
| 328 |
+
| `context_window` | 2 | (Keyword strategy) sentences around keyword to include as context |
|
| 329 |
+
|
| 330 |
+
### Engine
|
| 331 |
+
|
| 332 |
+
| Parameter | Default | Notes |
|
| 333 |
+
|-----------|---------|-------|
|
| 334 |
+
| `chunk_size` | 512 | Characters per chunk. Larger = more context per chunk |
|
| 335 |
+
| `chunk_overlap` | 128 | Overlap prevents losing context at chunk boundaries |
|
| 336 |
+
| `batch_size` | 64 | Encoding batch size for FAISS indexing |
|
| 337 |
+
|
| 338 |
+
---
|
| 339 |
+
|
| 340 |
+
## 11. Computational Resources
|
| 341 |
+
|
| 342 |
+
| Task | CPU | GPU (CUDA/MPS) | RAM |
|
| 343 |
+
|------|-----|----------------|-----|
|
| 344 |
+
| Training (small, <1K pairs) | OK | Faster (2-5x) | 4GB+ |
|
| 345 |
+
| Training (medium, 1K-10K pairs) | Slow | Recommended | 8GB+ |
|
| 346 |
+
| Training (large, 10K+ pairs) | Very slow | Required | 16GB+ |
|
| 347 |
+
| Indexing (1K chunks) | OK | Faster | 4GB+ |
|
| 348 |
+
| Querying | Fast | N/A | 2GB+ |
|
| 349 |
+
|
| 350 |
+
**Minimum:** MacBook with 8GB RAM can train small models on CPU.
|
| 351 |
+
**Recommended:** 16GB RAM + GPU (NVIDIA CUDA or Apple Silicon MPS).
|
| 352 |
+
|
| 353 |
+
---
|
| 354 |
+
|
| 355 |
+
## 12. Project Structure
|
| 356 |
+
|
| 357 |
+
```
|
| 358 |
+
esfiles/
|
| 359 |
+
├── pyproject.toml # Project config & dependencies (uv)
|
| 360 |
+
├── requirements.txt # Fallback for pip users
|
| 361 |
+
├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
|
| 362 |
+
├── training.py # Training pipeline: 3 strategies + evaluation
|
| 363 |
+
├── evaluation.py # Evaluation pipeline: metrics, reports
|
| 364 |
+
├── word2vec_baseline.py # Gensim Word2Vec baseline for comparison
|
| 365 |
+
├── server.py # FastAPI REST API
|
| 366 |
+
├── demo.py # CLI demo: Word2Vec vs Transformer comparison
|
| 367 |
+
├── HOWTO.md # This file
|
| 368 |
+
└── frontend/ # React + TypeScript UI
|
| 369 |
+
├── package.json
|
| 370 |
+
├── tsconfig.json
|
| 371 |
+
├── vite.config.ts
|
| 372 |
+
├── index.html
|
| 373 |
+
└── src/
|
| 374 |
+
├── main.tsx
|
| 375 |
+
├── App.tsx
|
| 376 |
+
├── styles.css
|
| 377 |
+
├── types.ts
|
| 378 |
+
├── api.ts
|
| 379 |
+
└── components/
|
| 380 |
+
├── ScoreBar.tsx
|
| 381 |
+
├── StatusMessage.tsx
|
| 382 |
+
├── TrainingPanel.tsx
|
| 383 |
+
├── EngineSetup.tsx
|
| 384 |
+
├── SemanticSearch.tsx
|
| 385 |
+
├── TextCompare.tsx
|
| 386 |
+
├── KeywordAnalysis.tsx
|
| 387 |
+
├── KeywordMatcher.tsx
|
| 388 |
+
├── BatchAnalysis.tsx
|
| 389 |
+
└── EvaluationDashboard.tsx
|
| 390 |
+
```
|
README.md
CHANGED
|
@@ -1,12 +1,203 @@
|
|
| 1 |
---
|
| 2 |
title: Esfiles
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
license: apache-2.0
|
| 9 |
short_description: 'A prototype to analyze embeddings and word correlations '
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Esfiles
|
| 3 |
+
emoji: "\U0001F3E2"
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
license: apache-2.0
|
| 10 |
short_description: 'A prototype to analyze embeddings and word correlations '
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Esfiles — Contextual Similarity Engine
|
| 14 |
+
|
| 15 |
+
A tool for analyzing word meanings in context using **transformer-based embeddings**. Unlike traditional approaches (Word2Vec) that assign one static vector per word, this system **fine-tunes on your corpus** so the same word gets different embeddings depending on its surrounding context — e.g. detecting that "pizza" is used as code for "school" in a set of documents.
|
| 16 |
+
|
| 17 |
+
Includes a **Word2Vec baseline** for side-by-side comparison.
|
| 18 |
+
|
| 19 |
+
## Stack
|
| 20 |
+
|
| 21 |
+
| Layer | Technology |
|
| 22 |
+
|-------|-----------|
|
| 23 |
+
| Embeddings | SentenceTransformers (PyTorch) |
|
| 24 |
+
| Vector search | FAISS |
|
| 25 |
+
| Baseline | gensim Word2Vec |
|
| 26 |
+
| Backend | FastAPI (Python) |
|
| 27 |
+
| Frontend | React 19 + TypeScript + Vite |
|
| 28 |
+
| Evaluation | scikit-learn metrics |
|
| 29 |
+
| Deployment | Docker (HuggingFace Spaces, local, Railway) |
|
| 30 |
+
|
| 31 |
+
## Prerequisites
|
| 32 |
+
|
| 33 |
+
- **Python 3.11+**
|
| 34 |
+
- **Node.js 18+** (for frontend)
|
| 35 |
+
- [uv](https://docs.astral.sh/uv/) (recommended) or pip
|
| 36 |
+
|
| 37 |
+
## Setup
|
| 38 |
+
|
| 39 |
+
### 1. Clone the repo
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
git clone <repo-url>
|
| 43 |
+
cd esfiles
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 2. Install Python dependencies
|
| 47 |
+
|
| 48 |
+
**With uv (recommended):**
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 52 |
+
uv sync
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
**With pip:**
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
python3 -m venv venv
|
| 59 |
+
source venv/bin/activate
|
| 60 |
+
pip install -r requirements.txt
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### 3. Install frontend dependencies
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
cd frontend
|
| 67 |
+
npm install
|
| 68 |
+
cd ..
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Usage
|
| 72 |
+
|
| 73 |
+
### CLI demo
|
| 74 |
+
|
| 75 |
+
Run the Word2Vec vs Transformer comparison demo:
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
uv run python demo.py
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
This builds both engines on a sample corpus and compares similarity scores, semantic search, keyword matching, and clustering.
|
| 82 |
+
|
| 83 |
+
### Web UI (development)
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
# Terminal 1 — API server
|
| 87 |
+
uv run python server.py
|
| 88 |
+
|
| 89 |
+
# Terminal 2 — React dev server
|
| 90 |
+
cd frontend && npm run dev
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
- **API docs:** http://localhost:8000/docs
|
| 94 |
+
- **Frontend:** http://localhost:5173
|
| 95 |
+
|
| 96 |
+
### Docker
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
docker compose up --build
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
The app will be available at http://localhost:8000. The Docker build compiles the React frontend and bundles it with the FastAPI server in a single container.
|
| 103 |
+
|
| 104 |
+
## How it works
|
| 105 |
+
|
| 106 |
+
**Pipeline: TRAIN → INDEX → ANALYZE → EVALUATE**
|
| 107 |
+
|
| 108 |
+
1. **Train** — Fine-tune a pretrained sentence-transformer on your corpus using one of three strategies:
|
| 109 |
+
- **Unsupervised (TSDAE):** No labels needed. Learns vocabulary and phrasing via denoising autoencoder.
|
| 110 |
+
- **Contrastive:** Auto-mines training pairs from document structure (adjacent sentences = similar).
|
| 111 |
+
- **Keyword-supervised:** You provide a keyword→meaning map (e.g. `{"pizza": "school"}`). The trainer generates context-aware training pairs.
|
| 112 |
+
|
| 113 |
+
2. **Index** — Chunk your documents and encode them into a FAISS vector index using the fine-tuned model.
|
| 114 |
+
|
| 115 |
+
3. **Analyze** — Query the index with semantic search, compare texts, analyze keyword meanings across documents, or match keywords to candidate meanings.
|
| 116 |
+
|
| 117 |
+
4. **Evaluate** — Measure disambiguation accuracy, retrieval metrics (P@K, MRR, NDCG), and clustering quality (NMI).
|
| 118 |
+
|
| 119 |
+
## API endpoints
|
| 120 |
+
|
| 121 |
+
### Training
|
| 122 |
+
| Method | Endpoint | Description |
|
| 123 |
+
|--------|----------|-------------|
|
| 124 |
+
| POST | `/api/train/unsupervised` | TSDAE domain adaptation |
|
| 125 |
+
| POST | `/api/train/contrastive` | Contrastive with auto-mined pairs |
|
| 126 |
+
| POST | `/api/train/keywords` | Keyword-supervised training |
|
| 127 |
+
| POST | `/api/train/evaluate` | Compare base vs trained model |
|
| 128 |
+
|
| 129 |
+
### Engine
|
| 130 |
+
| Method | Endpoint | Description |
|
| 131 |
+
|--------|----------|-------------|
|
| 132 |
+
| POST | `/api/init` | Initialize engine with a model |
|
| 133 |
+
| POST | `/api/documents` | Add a document |
|
| 134 |
+
| POST | `/api/documents/upload` | Upload a file as a document |
|
| 135 |
+
| POST | `/api/index/build` | Build FAISS index |
|
| 136 |
+
| POST | `/api/query` | Semantic search |
|
| 137 |
+
| POST | `/api/compare` | Compare two texts |
|
| 138 |
+
| POST | `/api/analyze/keyword` | Single keyword analysis |
|
| 139 |
+
| POST | `/api/analyze/batch` | Multi-keyword batch analysis |
|
| 140 |
+
| POST | `/api/match` | Match keyword to candidate meanings |
|
| 141 |
+
| GET | `/api/stats` | Corpus statistics |
|
| 142 |
+
|
| 143 |
+
### Evaluation
|
| 144 |
+
| Method | Endpoint | Description |
|
| 145 |
+
|--------|----------|-------------|
|
| 146 |
+
| POST | `/api/eval/disambiguation` | Disambiguation accuracy |
|
| 147 |
+
| POST | `/api/eval/retrieval` | Retrieval metrics (P@K, MRR, NDCG) |
|
| 148 |
+
| GET | `/api/eval/similarity-distribution` | Pairwise similarity histogram |
|
| 149 |
+
|
| 150 |
+
### Word2Vec baseline
|
| 151 |
+
| Method | Endpoint | Description |
|
| 152 |
+
|--------|----------|-------------|
|
| 153 |
+
| POST | `/api/w2v/init` | Train Word2Vec on corpus |
|
| 154 |
+
| POST | `/api/w2v/compare` | Compare two texts |
|
| 155 |
+
| POST | `/api/w2v/query` | Search corpus |
|
| 156 |
+
| POST | `/api/w2v/similar-words` | Find similar words |
|
| 157 |
+
|
| 158 |
+
Full interactive docs available at `/docs` when the server is running.
|
| 159 |
+
|
| 160 |
+
## Project structure
|
| 161 |
+
|
| 162 |
+
```
|
| 163 |
+
esfiles/
|
| 164 |
+
├── pyproject.toml # Dependencies (uv)
|
| 165 |
+
├── requirements.txt # Fallback for pip
|
| 166 |
+
├── uv.lock # Lockfile for reproducible installs
|
| 167 |
+
├── contextual_similarity.py # Core engine: chunking, embedding, FAISS, analysis
|
| 168 |
+
├── training.py # Training pipeline: 3 strategies + evaluation
|
| 169 |
+
├── evaluation.py # Evaluation: metrics, reports
|
| 170 |
+
├── word2vec_baseline.py # gensim Word2Vec baseline
|
| 171 |
+
├── data_loader.py # Epstein Files dataset loader (HuggingFace + ChromaDB)
|
| 172 |
+
├── server.py # FastAPI REST API
|
| 173 |
+
├── demo.py # CLI demo: Word2Vec vs Transformer comparison
|
| 174 |
+
├── Dockerfile # Multi-stage build (Node + Python)
|
| 175 |
+
├── docker-compose.yml # Local Docker setup
|
| 176 |
+
├── HOWTO.md # In-depth usage guide
|
| 177 |
+
└── frontend/ # React + TypeScript UI
|
| 178 |
+
├── package.json
|
| 179 |
+
├── vite.config.ts
|
| 180 |
+
├── index.html
|
| 181 |
+
└── src/
|
| 182 |
+
├── App.tsx # Main app with tab navigation
|
| 183 |
+
├── api.ts # API client
|
| 184 |
+
├── types.ts # TypeScript types
|
| 185 |
+
└── components/ # UI components (training, search, evaluation, etc.)
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
## Base models
|
| 189 |
+
|
| 190 |
+
| Model | Dimensions | Quality | Speed |
|
| 191 |
+
|-------|-----------|---------|-------|
|
| 192 |
+
| `all-MiniLM-L6-v2` | 384 | Good | Fast |
|
| 193 |
+
| `all-mpnet-base-v2` | 768 | Best | Medium |
|
| 194 |
+
|
| 195 |
+
Start with `all-MiniLM-L6-v2` for iteration, use `all-mpnet-base-v2` for production.
|
| 196 |
+
|
| 197 |
+
## Further reading
|
| 198 |
+
|
| 199 |
+
See [HOWTO.md](HOWTO.md) for detailed usage examples including Python API usage, training configuration, tuning parameters, and evaluation metrics.
|
| 200 |
+
|
| 201 |
+
## License
|
| 202 |
+
|
| 203 |
+
Apache 2.0
|
contextual_similarity.py
ADDED
|
@@ -0,0 +1,850 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Contextual Word Similarity Engine
|
| 3 |
+
|
| 4 |
+
Uses transformer-based sentence embeddings (SentenceTransformers) and FAISS
|
| 5 |
+
vector search to find and compare contextual meanings of keywords within
|
| 6 |
+
large documents. Unlike static embeddings (Word2Vec/GloVe), this captures
|
| 7 |
+
how word meaning changes based on surrounding context.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
engine = ContextualSimilarityEngine()
|
| 11 |
+
engine.add_document("my_doc", text)
|
| 12 |
+
engine.build_index()
|
| 13 |
+
results = engine.analyze_keyword("pizza", top_k=10)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import re
|
| 17 |
+
import logging
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Optional
|
| 21 |
+
|
| 22 |
+
import faiss
|
| 23 |
+
import numpy as np
|
| 24 |
+
from sentence_transformers import SentenceTransformer, util
|
| 25 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 26 |
+
from tqdm import tqdm
|
| 27 |
+
|
| 28 |
+
logger = logging.getLogger(__name__)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class Chunk:
    """A passage of text extracted from a document, plus location metadata."""
    text: str          # the passage content
    doc_id: str        # identifier of the source document
    chunk_index: int   # ordinal position of this chunk within its document
    start_char: int    # character offset where the chunk starts in the document
    end_char: int      # character offset where the chunk ends in the document

    def __repr__(self):
        # Show a short, newline-free preview so logs stay one line per chunk.
        snippet = self.text[:80].replace("\n", " ")
        return f"Chunk(doc={self.doc_id!r}, idx={self.chunk_index}, text={snippet!r}...)"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class SimilarityResult:
    """A single similarity match returned by a search over the corpus."""
    chunk: Chunk   # the matched passage
    score: float   # cosine similarity to the query (inner product of unit vectors)
    rank: int      # 1-based position in the result list (lower = more similar)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class KeywordContext:
    """A keyword occurrence with its surrounding context and embedding."""
    keyword: str   # the keyword that was matched
    chunk: Chunk   # the chunk in which the keyword occurs
    # (start, end) character offsets of each keyword match within chunk.text,
    # suitable for highlighting in a UI.
    highlight_positions: list = field(default_factory=list)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class KeywordAnalysis:
    """Full analysis of a keyword's contextual meanings across a corpus."""
    keyword: str              # the analyzed keyword
    total_occurrences: int    # number of chunks containing the keyword
    # One dict per distinct meaning, with keys: cluster_id, size,
    # representative_text, contexts, similar_passages.
    meaning_clusters: list = field(default_factory=list)
    # Mapping other_keyword -> cosine similarity of context centroids;
    # populated by batch_analyze_keywords(compare_across=True).
    cross_keyword_similarities: dict = field(default_factory=dict)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class ContextualSimilarityEngine:
|
| 71 |
+
"""
|
| 72 |
+
Engine for contextual word similarity analysis using transformer embeddings.
|
| 73 |
+
|
| 74 |
+
Loads documents, chunks them into passages, embeds with a SentenceTransformer
|
| 75 |
+
model, indexes with FAISS, and provides methods to:
|
| 76 |
+
- Find all contextual usages of a keyword
|
| 77 |
+
- Cluster keyword usages into distinct meanings
|
| 78 |
+
- Compare keyword contexts across documents
|
| 79 |
+
- Find passages most similar to a query
|
| 80 |
+
- Batch-analyze multiple keywords
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
def __init__(
|
| 84 |
+
self,
|
| 85 |
+
model_name: str = "all-MiniLM-L6-v2",
|
| 86 |
+
chunk_size: int = 512,
|
| 87 |
+
chunk_overlap: int = 128,
|
| 88 |
+
device: Optional[str] = None,
|
| 89 |
+
batch_size: int = 64,
|
| 90 |
+
):
|
| 91 |
+
"""
|
| 92 |
+
Args:
|
| 93 |
+
model_name: HuggingFace SentenceTransformer model name.
|
| 94 |
+
- "all-MiniLM-L6-v2": fast, good quality (384-dim)
|
| 95 |
+
- "all-mpnet-base-v2": best quality general-purpose (768-dim)
|
| 96 |
+
- "BAAI/bge-large-en-v1.5": high accuracy, larger (1024-dim)
|
| 97 |
+
chunk_size: Max characters per chunk.
|
| 98 |
+
chunk_overlap: Overlap between consecutive chunks (preserves context at boundaries).
|
| 99 |
+
device: PyTorch device ("cpu", "cuda", "mps"). Auto-detected if None.
|
| 100 |
+
batch_size: Batch size for encoding (tune for your GPU memory).
|
| 101 |
+
"""
|
| 102 |
+
logger.info(f"Loading model: {model_name}")
|
| 103 |
+
self._model_name = model_name
|
| 104 |
+
self.model = SentenceTransformer(model_name, device=device)
|
| 105 |
+
self.chunk_size = chunk_size
|
| 106 |
+
self.chunk_overlap = chunk_overlap
|
| 107 |
+
self.batch_size = batch_size
|
| 108 |
+
self.embedding_dim = self.model.get_sentence_embedding_dimension()
|
| 109 |
+
|
| 110 |
+
# Storage
|
| 111 |
+
self.chunks: list[Chunk] = []
|
| 112 |
+
self.embeddings: Optional[np.ndarray] = None
|
| 113 |
+
self.index: Optional[faiss.IndexFlatIP] = None
|
| 114 |
+
self._doc_ids: set[str] = set()
|
| 115 |
+
|
| 116 |
+
# ------------------------------------------------------------------ #
|
| 117 |
+
# Document loading & chunking
|
| 118 |
+
# ------------------------------------------------------------------ #
|
| 119 |
+
|
| 120 |
+
def add_document(self, doc_id: str, text: str) -> list[Chunk]:
|
| 121 |
+
"""
|
| 122 |
+
Chunk a document and add it to the corpus.
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
doc_id: Unique identifier for this document.
|
| 126 |
+
text: Full document text.
|
| 127 |
+
|
| 128 |
+
Returns:
|
| 129 |
+
List of Chunk objects created from this document.
|
| 130 |
+
"""
|
| 131 |
+
if doc_id in self._doc_ids:
|
| 132 |
+
raise ValueError(f"Document '{doc_id}' already added. Use a unique doc_id.")
|
| 133 |
+
self._doc_ids.add(doc_id)
|
| 134 |
+
|
| 135 |
+
new_chunks = self._chunk_text(text, doc_id)
|
| 136 |
+
self.chunks.extend(new_chunks)
|
| 137 |
+
logger.info(f"Added document '{doc_id}': {len(new_chunks)} chunks")
|
| 138 |
+
|
| 139 |
+
# Invalidate index so user must rebuild
|
| 140 |
+
self.embeddings = None
|
| 141 |
+
self.index = None
|
| 142 |
+
|
| 143 |
+
return new_chunks
|
| 144 |
+
|
| 145 |
+
def add_document_from_file(self, file_path: str, doc_id: Optional[str] = None) -> list[Chunk]:
|
| 146 |
+
"""Load a text file and add it as a document."""
|
| 147 |
+
path = Path(file_path).resolve()
|
| 148 |
+
base_dir = Path(__file__).parent.resolve()
|
| 149 |
+
if not path.is_relative_to(base_dir):
|
| 150 |
+
raise ValueError("File path must be within the project directory.")
|
| 151 |
+
if not path.exists():
|
| 152 |
+
raise FileNotFoundError(f"File not found: {file_path}")
|
| 153 |
+
text = path.read_text(encoding="utf-8")
|
| 154 |
+
return self.add_document(doc_id or path.stem, text)
|
| 155 |
+
|
| 156 |
+
def _chunk_text(self, text: str, doc_id: str) -> list[Chunk]:
|
| 157 |
+
"""
|
| 158 |
+
Split text into overlapping chunks, breaking at sentence boundaries
|
| 159 |
+
when possible to preserve semantic coherence.
|
| 160 |
+
"""
|
| 161 |
+
# Normalize whitespace
|
| 162 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 163 |
+
|
| 164 |
+
chunks = []
|
| 165 |
+
start = 0
|
| 166 |
+
chunk_idx = 0
|
| 167 |
+
|
| 168 |
+
while start < len(text):
|
| 169 |
+
end = start + self.chunk_size
|
| 170 |
+
|
| 171 |
+
# If we're not at the end, try to break at a sentence boundary
|
| 172 |
+
if end < len(text):
|
| 173 |
+
# Look for sentence-ending punctuation near the chunk boundary
|
| 174 |
+
search_region = text[max(end - 100, start):end]
|
| 175 |
+
# Find last sentence break in the search region
|
| 176 |
+
for sep in [". ", ".\n", "! ", "!\n", "? ", "?\n", "\n\n"]:
|
| 177 |
+
last_break = search_region.rfind(sep)
|
| 178 |
+
if last_break != -1:
|
| 179 |
+
end = max(end - 100, start) + last_break + len(sep)
|
| 180 |
+
break
|
| 181 |
+
|
| 182 |
+
chunk_text = text[start:end].strip()
|
| 183 |
+
if chunk_text:
|
| 184 |
+
chunks.append(Chunk(
|
| 185 |
+
text=chunk_text,
|
| 186 |
+
doc_id=doc_id,
|
| 187 |
+
chunk_index=chunk_idx,
|
| 188 |
+
start_char=start,
|
| 189 |
+
end_char=end,
|
| 190 |
+
))
|
| 191 |
+
chunk_idx += 1
|
| 192 |
+
|
| 193 |
+
# Advance with overlap
|
| 194 |
+
start = end - self.chunk_overlap if end < len(text) else end
|
| 195 |
+
|
| 196 |
+
return chunks
|
| 197 |
+
|
| 198 |
+
# ------------------------------------------------------------------ #
|
| 199 |
+
# Embedding & indexing
|
| 200 |
+
# ------------------------------------------------------------------ #
|
| 201 |
+
|
| 202 |
+
def build_index(self, normalize: bool = True, show_progress: bool = True) -> None:
|
| 203 |
+
"""
|
| 204 |
+
Embed all chunks and build a FAISS index for fast similarity search.
|
| 205 |
+
|
| 206 |
+
Args:
|
| 207 |
+
normalize: L2-normalize embeddings (enables cosine similarity via inner product).
|
| 208 |
+
show_progress: Show a progress bar during encoding.
|
| 209 |
+
"""
|
| 210 |
+
if not self.chunks:
|
| 211 |
+
raise RuntimeError("No documents loaded. Call add_document() first.")
|
| 212 |
+
|
| 213 |
+
logger.info(f"Encoding {len(self.chunks)} chunks...")
|
| 214 |
+
texts = [c.text for c in self.chunks]
|
| 215 |
+
|
| 216 |
+
self.embeddings = self.model.encode(
|
| 217 |
+
texts,
|
| 218 |
+
batch_size=self.batch_size,
|
| 219 |
+
show_progress_bar=show_progress,
|
| 220 |
+
convert_to_numpy=True,
|
| 221 |
+
normalize_embeddings=normalize,
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
# Build FAISS inner-product index (cosine similarity when vectors are normalized)
|
| 225 |
+
self.index = faiss.IndexFlatIP(self.embedding_dim)
|
| 226 |
+
self.index.add(self.embeddings.astype(np.float32))
|
| 227 |
+
|
| 228 |
+
logger.info(f"Index built: {self.index.ntotal} vectors, dim={self.embedding_dim}")
|
| 229 |
+
|
| 230 |
+
# ------------------------------------------------------------------ #
|
| 231 |
+
# Core query methods
|
| 232 |
+
# ------------------------------------------------------------------ #
|
| 233 |
+
|
| 234 |
+
def query(self, text: str, top_k: int = 10) -> list[SimilarityResult]:
|
| 235 |
+
"""
|
| 236 |
+
Find the most similar chunks to a query text.
|
| 237 |
+
|
| 238 |
+
Args:
|
| 239 |
+
text: Query string (sentence, phrase, or keyword in context).
|
| 240 |
+
top_k: Number of results to return.
|
| 241 |
+
|
| 242 |
+
Returns:
|
| 243 |
+
List of SimilarityResult sorted by descending similarity score.
|
| 244 |
+
"""
|
| 245 |
+
self._ensure_index()
|
| 246 |
+
|
| 247 |
+
query_vec = self.model.encode(
|
| 248 |
+
[text], normalize_embeddings=True, convert_to_numpy=True
|
| 249 |
+
).astype(np.float32)
|
| 250 |
+
|
| 251 |
+
scores, indices = self.index.search(query_vec, top_k)
|
| 252 |
+
|
| 253 |
+
results = []
|
| 254 |
+
for rank, (score, idx) in enumerate(zip(scores[0], indices[0])):
|
| 255 |
+
if idx == -1:
|
| 256 |
+
continue
|
| 257 |
+
results.append(SimilarityResult(
|
| 258 |
+
chunk=self.chunks[idx],
|
| 259 |
+
score=float(score),
|
| 260 |
+
rank=rank + 1,
|
| 261 |
+
))
|
| 262 |
+
return results
|
| 263 |
+
|
| 264 |
+
def compare_texts(self, text_a: str, text_b: str) -> float:
|
| 265 |
+
"""
|
| 266 |
+
Compute cosine similarity between two texts directly.
|
| 267 |
+
|
| 268 |
+
Returns:
|
| 269 |
+
Similarity score in [-1, 1] (typically [0, 1] for natural language).
|
| 270 |
+
"""
|
| 271 |
+
vecs = self.model.encode(
|
| 272 |
+
[text_a, text_b], normalize_embeddings=True, convert_to_tensor=True
|
| 273 |
+
)
|
| 274 |
+
return float(util.pytorch_cos_sim(vecs[0], vecs[1]).item())
|
| 275 |
+
|
| 276 |
+
# ------------------------------------------------------------------ #
|
| 277 |
+
# Keyword analysis
|
| 278 |
+
# ------------------------------------------------------------------ #
|
| 279 |
+
|
| 280 |
+
def find_keyword_contexts(
|
| 281 |
+
self, keyword: str, case_sensitive: bool = False
|
| 282 |
+
) -> list[KeywordContext]:
|
| 283 |
+
"""
|
| 284 |
+
Find all chunks containing a keyword and return them as KeywordContext objects.
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
keyword: The word or phrase to search for.
|
| 288 |
+
case_sensitive: Whether matching is case-sensitive.
|
| 289 |
+
|
| 290 |
+
Returns:
|
| 291 |
+
List of KeywordContext with chunk and highlight positions.
|
| 292 |
+
"""
|
| 293 |
+
if len(keyword) > 200:
|
| 294 |
+
raise ValueError("Keyword must be 200 characters or fewer.")
|
| 295 |
+
flags = 0 if case_sensitive else re.IGNORECASE
|
| 296 |
+
pattern = re.compile(r"\b" + re.escape(keyword) + r"\b", flags)
|
| 297 |
+
|
| 298 |
+
contexts = []
|
| 299 |
+
for chunk in self.chunks:
|
| 300 |
+
matches = list(pattern.finditer(chunk.text))
|
| 301 |
+
if matches:
|
| 302 |
+
positions = [(m.start(), m.end()) for m in matches]
|
| 303 |
+
contexts.append(KeywordContext(
|
| 304 |
+
keyword=keyword,
|
| 305 |
+
chunk=chunk,
|
| 306 |
+
highlight_positions=positions,
|
| 307 |
+
))
|
| 308 |
+
return contexts
|
| 309 |
+
|
| 310 |
+
def analyze_keyword(
|
| 311 |
+
self,
|
| 312 |
+
keyword: str,
|
| 313 |
+
top_k: int = 10,
|
| 314 |
+
cluster_threshold: float = 0.35,
|
| 315 |
+
case_sensitive: bool = False,
|
| 316 |
+
) -> KeywordAnalysis:
|
| 317 |
+
"""
|
| 318 |
+
Analyze all contextual usages of a keyword across the corpus.
|
| 319 |
+
|
| 320 |
+
Finds every chunk containing the keyword, embeds them, clusters them
|
| 321 |
+
by semantic similarity (agglomerative clustering), and returns a
|
| 322 |
+
structured analysis with distinct meaning groups.
|
| 323 |
+
|
| 324 |
+
Args:
|
| 325 |
+
keyword: Word or phrase to analyze.
|
| 326 |
+
top_k: Max similar chunks to return per meaning cluster.
|
| 327 |
+
cluster_threshold: Distance threshold for clustering (lower = more clusters).
|
| 328 |
+
0.35 works well for clearly distinct meanings; raise to 0.5+ to merge similar ones.
|
| 329 |
+
case_sensitive: Whether keyword matching is case-sensitive.
|
| 330 |
+
|
| 331 |
+
Returns:
|
| 332 |
+
KeywordAnalysis with meaning clusters and similarity info.
|
| 333 |
+
"""
|
| 334 |
+
self._ensure_index()
|
| 335 |
+
contexts = self.find_keyword_contexts(keyword, case_sensitive)
|
| 336 |
+
|
| 337 |
+
if not contexts:
|
| 338 |
+
return KeywordAnalysis(keyword=keyword, total_occurrences=0)
|
| 339 |
+
|
| 340 |
+
# Get embeddings for keyword-containing chunks
|
| 341 |
+
chunk_indices = []
|
| 342 |
+
for ctx in contexts:
|
| 343 |
+
idx = self.chunks.index(ctx.chunk)
|
| 344 |
+
chunk_indices.append(idx)
|
| 345 |
+
|
| 346 |
+
kw_embeddings = self.embeddings[chunk_indices]
|
| 347 |
+
|
| 348 |
+
# Cluster the keyword contexts by semantic similarity
|
| 349 |
+
clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)
|
| 350 |
+
|
| 351 |
+
# Build meaning clusters
|
| 352 |
+
meaning_clusters = []
|
| 353 |
+
for cluster_id in sorted(set(clusters)):
|
| 354 |
+
member_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
|
| 355 |
+
member_contexts = [contexts[i] for i in member_indices]
|
| 356 |
+
member_embeds = kw_embeddings[member_indices]
|
| 357 |
+
|
| 358 |
+
# Centroid of this cluster
|
| 359 |
+
centroid = member_embeds.mean(axis=0, keepdims=True).astype(np.float32)
|
| 360 |
+
faiss.normalize_L2(centroid)
|
| 361 |
+
|
| 362 |
+
# Find top_k most similar chunks in the full corpus to this meaning
|
| 363 |
+
scores, idx_arr = self.index.search(centroid, top_k)
|
| 364 |
+
similar = []
|
| 365 |
+
for rank, (score, idx) in enumerate(zip(scores[0], idx_arr[0])):
|
| 366 |
+
if idx == -1:
|
| 367 |
+
continue
|
| 368 |
+
similar.append(SimilarityResult(
|
| 369 |
+
chunk=self.chunks[idx],
|
| 370 |
+
score=float(score),
|
| 371 |
+
rank=rank + 1,
|
| 372 |
+
))
|
| 373 |
+
|
| 374 |
+
meaning_clusters.append({
|
| 375 |
+
"cluster_id": cluster_id,
|
| 376 |
+
"size": len(member_indices),
|
| 377 |
+
"representative_text": member_contexts[0].chunk.text[:200],
|
| 378 |
+
"contexts": member_contexts,
|
| 379 |
+
"similar_passages": similar,
|
| 380 |
+
})
|
| 381 |
+
|
| 382 |
+
return KeywordAnalysis(
|
| 383 |
+
keyword=keyword,
|
| 384 |
+
total_occurrences=len(contexts),
|
| 385 |
+
meaning_clusters=meaning_clusters,
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
def batch_analyze_keywords(
|
| 389 |
+
self,
|
| 390 |
+
keywords: list[str],
|
| 391 |
+
top_k: int = 10,
|
| 392 |
+
cluster_threshold: float = 0.35,
|
| 393 |
+
compare_across: bool = True,
|
| 394 |
+
) -> dict[str, KeywordAnalysis]:
|
| 395 |
+
"""
|
| 396 |
+
Analyze multiple keywords and optionally compute cross-keyword similarities.
|
| 397 |
+
|
| 398 |
+
Args:
|
| 399 |
+
keywords: List of keywords to analyze.
|
| 400 |
+
top_k: Results per cluster.
|
| 401 |
+
cluster_threshold: Clustering distance threshold.
|
| 402 |
+
compare_across: If True, compute pairwise similarity between keyword contexts.
|
| 403 |
+
|
| 404 |
+
Returns:
|
| 405 |
+
Dict mapping keyword -> KeywordAnalysis.
|
| 406 |
+
"""
|
| 407 |
+
results = {}
|
| 408 |
+
for kw in tqdm(keywords, desc="Analyzing keywords"):
|
| 409 |
+
results[kw] = self.analyze_keyword(kw, top_k, cluster_threshold)
|
| 410 |
+
|
| 411 |
+
if compare_across and len(keywords) > 1:
|
| 412 |
+
self._compute_cross_keyword_similarities(results)
|
| 413 |
+
|
| 414 |
+
return results
|
| 415 |
+
|
| 416 |
+
def _compute_cross_keyword_similarities(
|
| 417 |
+
self, analyses: dict[str, KeywordAnalysis]
|
| 418 |
+
) -> None:
|
| 419 |
+
"""Compute average cosine similarity between each pair of keywords' contexts."""
|
| 420 |
+
keyword_centroids = {}
|
| 421 |
+
for kw, analysis in analyses.items():
|
| 422 |
+
if not analysis.meaning_clusters:
|
| 423 |
+
continue
|
| 424 |
+
# Collect all context embeddings for this keyword
|
| 425 |
+
all_indices = []
|
| 426 |
+
for cluster in analysis.meaning_clusters:
|
| 427 |
+
for ctx in cluster["contexts"]:
|
| 428 |
+
idx = self.chunks.index(ctx.chunk)
|
| 429 |
+
all_indices.append(idx)
|
| 430 |
+
if all_indices:
|
| 431 |
+
embeds = self.embeddings[all_indices]
|
| 432 |
+
centroid = embeds.mean(axis=0)
|
| 433 |
+
norm = np.linalg.norm(centroid)
|
| 434 |
+
if norm > 0:
|
| 435 |
+
centroid = centroid / norm
|
| 436 |
+
keyword_centroids[kw] = centroid
|
| 437 |
+
|
| 438 |
+
# Pairwise similarities
|
| 439 |
+
kw_list = list(keyword_centroids.keys())
|
| 440 |
+
for i, kw_a in enumerate(kw_list):
|
| 441 |
+
sims = {}
|
| 442 |
+
for j, kw_b in enumerate(kw_list):
|
| 443 |
+
if i != j:
|
| 444 |
+
score = float(np.dot(keyword_centroids[kw_a], keyword_centroids[kw_b]))
|
| 445 |
+
sims[kw_b] = score
|
| 446 |
+
if kw_a in analyses:
|
| 447 |
+
analyses[kw_a].cross_keyword_similarities = sims
|
| 448 |
+
|
| 449 |
+
# ------------------------------------------------------------------ #
|
| 450 |
+
# Contextual keyword matching (the core use case)
|
| 451 |
+
# ------------------------------------------------------------------ #
|
| 452 |
+
|
| 453 |
+
def match_keyword_to_meaning(
|
| 454 |
+
self,
|
| 455 |
+
keyword: str,
|
| 456 |
+
candidate_meanings: list[str],
|
| 457 |
+
) -> list[dict]:
|
| 458 |
+
"""
|
| 459 |
+
Given a keyword and a list of candidate meanings (words/phrases),
|
| 460 |
+
find which meaning each occurrence of the keyword is closest to.
|
| 461 |
+
|
| 462 |
+
This is the core "pizza means school" use case: you provide the keyword
|
| 463 |
+
"pizza" and candidates ["pizza (food)", "school", "homework"], and this
|
| 464 |
+
method tells you which meaning each usage of "pizza" maps to.
|
| 465 |
+
|
| 466 |
+
Args:
|
| 467 |
+
keyword: The keyword to analyze (e.g. "pizza").
|
| 468 |
+
candidate_meanings: List of meaning descriptions (e.g. ["food", "school"]).
|
| 469 |
+
|
| 470 |
+
Returns:
|
| 471 |
+
List of dicts with keys: chunk, best_match, scores (all candidates).
|
| 472 |
+
"""
|
| 473 |
+
self._ensure_index()
|
| 474 |
+
|
| 475 |
+
contexts = self.find_keyword_contexts(keyword)
|
| 476 |
+
if not contexts:
|
| 477 |
+
return []
|
| 478 |
+
|
| 479 |
+
# Embed all candidate meanings
|
| 480 |
+
candidate_vecs = self.model.encode(
|
| 481 |
+
candidate_meanings, normalize_embeddings=True, convert_to_tensor=True
|
| 482 |
+
)
|
| 483 |
+
|
| 484 |
+
results = []
|
| 485 |
+
for ctx in contexts:
|
| 486 |
+
# Embed the chunk containing the keyword
|
| 487 |
+
chunk_vec = self.model.encode(
|
| 488 |
+
[ctx.chunk.text], normalize_embeddings=True, convert_to_tensor=True
|
| 489 |
+
)
|
| 490 |
+
|
| 491 |
+
# Score against each candidate
|
| 492 |
+
scores = util.pytorch_cos_sim(chunk_vec, candidate_vecs)[0]
|
| 493 |
+
score_dict = {
|
| 494 |
+
meaning: float(scores[i]) for i, meaning in enumerate(candidate_meanings)
|
| 495 |
+
}
|
| 496 |
+
best = max(score_dict, key=score_dict.get)
|
| 497 |
+
|
| 498 |
+
results.append({
|
| 499 |
+
"chunk": ctx.chunk,
|
| 500 |
+
"best_match": best,
|
| 501 |
+
"best_score": score_dict[best],
|
| 502 |
+
"all_scores": score_dict,
|
| 503 |
+
})
|
| 504 |
+
|
| 505 |
+
return results
|
| 506 |
+
|
| 507 |
+
# ------------------------------------------------------------------ #
|
| 508 |
+
# Context inference (keyword → meaning words)
|
| 509 |
+
# ------------------------------------------------------------------ #
|
| 510 |
+
|
| 511 |
+
# Common English stopwords to exclude from context word extraction
|
| 512 |
+
    # Common English stopwords (plus number words) excluded from context-word
    # extraction so inferred meanings surface content-bearing terms only.
    # frozenset gives O(1) membership tests and makes the set immutable.
    _STOPWORDS = frozenset(
        "a an the and or but in on at to for of is it that this was were be been "
        "being have has had do does did will would shall should may might can could "
        "not no nor so if then than too very just about above after again all also "
        "am are as between both by each few from further get got he her here hers "
        "herself him himself his how i its itself me more most my myself no nor "
        "only other our ours ourselves out over own same she some such their theirs "
        "them themselves there these they those through under until up us we what "
        "when where which while who whom why with you your yours yourself yourselves "
        "one two three four five six seven eight nine ten into been being because "
        "during before between against without within along across behind since "
        "upon around among".split()
    )
|
| 525 |
+
|
| 526 |
+
def infer_keyword_meanings(
    self,
    keyword: str,
    context_window: int = 120,
    top_words: int = 8,
    cluster_threshold: float = 0.35,
    max_meanings: int = 10,
) -> dict:
    """
    Infer what a keyword likely means based on its surrounding context words.

    Finds all occurrences, clusters them by semantic similarity, then extracts
    the most distinctive co-occurring words for each meaning cluster.

    Args:
        keyword: The keyword to analyze.
        context_window: Characters around each keyword occurrence to examine.
        top_words: Number of associated words to return per meaning.
        cluster_threshold: Distance threshold for clustering.
        max_meanings: Maximum number of meaning clusters to return.

    Returns:
        Dict with keyword, total_occurrences, and meanings list.

    Raises:
        RuntimeError: If the index has not been built yet.
    """
    self._ensure_index()
    contexts = self.find_keyword_contexts(keyword)

    if not contexts:
        return {
            "keyword": keyword,
            "total_occurrences": 0,
            "meanings": [],
        }

    # Map each chunk object to its row in self.embeddings once, by identity.
    # The previous self.chunks.index(ctx.chunk) was an O(len(chunks)) scan per
    # context (O(n*m) overall) and, being equality-based, could select the
    # wrong row if two chunks compared equal.
    row_of = {id(chunk): row for row, chunk in enumerate(self.chunks)}
    chunk_indices = [row_of[id(ctx.chunk)] for ctx in contexts]
    kw_embeddings = self.embeddings[chunk_indices]
    clusters = self._cluster_embeddings(kw_embeddings, threshold=cluster_threshold)

    total = len(contexts)
    kw_lower = keyword.lower()
    word_pattern = re.compile(r"[a-zA-Z]{3,}")

    # Global word frequencies (across all occurrences) for TF-IDF-like scoring
    global_word_counts: dict[str, int] = {}
    cluster_data: dict[int, list[dict[str, int]]] = {}

    for i, ctx in enumerate(contexts):
        cluster_id = clusters[i]
        if cluster_id not in cluster_data:
            cluster_data[cluster_id] = []

        # Count candidate words inside a +/- context_window character window
        # around every highlighted occurrence in this chunk.
        local_counts: dict[str, int] = {}
        for start, end in ctx.highlight_positions:
            window_start = max(0, start - context_window)
            window_end = min(len(ctx.chunk.text), end + context_window)
            window_text = ctx.chunk.text[window_start:window_end].lower()

            for word_match in word_pattern.finditer(window_text):
                w = word_match.group()
                if w == kw_lower or w in self._STOPWORDS or len(w) < 3:
                    continue
                local_counts[w] = local_counts.get(w, 0) + 1
                global_word_counts[w] = global_word_counts.get(w, 0) + 1

        cluster_data[cluster_id].append(local_counts)

    # Build meanings from clusters
    meanings = []
    num_clusters = len(cluster_data)
    for cluster_id in sorted(cluster_data.keys()):
        members = cluster_data[cluster_id]
        count = len(members)
        confidence = round(count / total, 3)

        # Aggregate word counts for this cluster
        cluster_word_counts: dict[str, int] = {}
        for member_counts in members:
            for w, c in member_counts.items():
                cluster_word_counts[w] = cluster_word_counts.get(w, 0) + c

        # Score words: cluster frequency weighted by distinctiveness
        # (how much more frequent in this cluster vs globally).
        # The cluster total is loop-invariant, so compute it once.
        cluster_total = max(sum(cluster_word_counts.values()), 1)
        word_scores: dict[str, float] = {}
        for w, cluster_count in cluster_word_counts.items():
            global_count = global_word_counts.get(w, 1)
            # TF in cluster * IDF-like distinctiveness
            tf = cluster_count / cluster_total
            distinctiveness = (cluster_count / global_count) if num_clusters > 1 else 1.0
            word_scores[w] = tf * (0.5 + 0.5 * distinctiveness)

        # Get top words
        sorted_words = sorted(word_scores.items(), key=lambda x: -x[1])[:top_words]
        associated_words = [
            {"word": w, "score": round(s, 4)} for w, s in sorted_words
        ]

        # Up to three short snippets centered on the first occurrence in each
        # member chunk, with ellipses when truncated.
        example_contexts = []
        member_indices = [j for j, c in enumerate(clusters) if c == cluster_id]
        for j in member_indices[:3]:  # max 3 examples
            ctx = contexts[j]
            if ctx.highlight_positions:
                start, end = ctx.highlight_positions[0]
                snippet_start = max(0, start - 80)
                snippet_end = min(len(ctx.chunk.text), end + 80)
                snippet = ctx.chunk.text[snippet_start:snippet_end].strip()
                if snippet_start > 0:
                    snippet = "..." + snippet
                if snippet_end < len(ctx.chunk.text):
                    snippet = snippet + "..."
                example_contexts.append({
                    "doc_id": ctx.chunk.doc_id,
                    "snippet": snippet,
                })

        meanings.append({
            "cluster_id": cluster_id,
            "occurrences": count,
            "confidence": confidence,
            "associated_words": associated_words,
            "example_contexts": example_contexts,
        })

    # Sort by confidence descending
    meanings.sort(key=lambda m: -m["confidence"])
    meanings = meanings[:max_meanings]

    return {
        "keyword": keyword,
        "total_occurrences": total,
        "meanings": meanings,
    }
|
| 660 |
+
|
| 661 |
+
# ------------------------------------------------------------------ #
|
| 662 |
+
# Utilities
|
| 663 |
+
# ------------------------------------------------------------------ #
|
| 664 |
+
|
| 665 |
+
def _cluster_embeddings(
|
| 666 |
+
self, embeddings: np.ndarray, threshold: float = 0.35
|
| 667 |
+
) -> list[int]:
|
| 668 |
+
"""Cluster embeddings using agglomerative clustering with cosine distance."""
|
| 669 |
+
if len(embeddings) == 1:
|
| 670 |
+
return [0]
|
| 671 |
+
|
| 672 |
+
clustering = AgglomerativeClustering(
|
| 673 |
+
n_clusters=None,
|
| 674 |
+
distance_threshold=threshold,
|
| 675 |
+
metric="cosine",
|
| 676 |
+
linkage="average",
|
| 677 |
+
)
|
| 678 |
+
labels = clustering.fit_predict(embeddings)
|
| 679 |
+
return labels.tolist()
|
| 680 |
+
|
| 681 |
+
def similar_words(self, word: str, top_k: int = 10) -> list[dict]:
    """
    Find words that appear in similar contexts using transformer embeddings.

    Extracts unique words from the corpus, encodes them, and finds nearest
    neighbors by cosine similarity. Unlike Word2Vec (one static vector per
    word), this uses the transformer's contextual understanding.

    Args:
        word: Target word.
        top_k: Number of similar words to return.

    Returns:
        List of {"word": str, "score": float} sorted by descending similarity.
    """
    self._ensure_index()

    token_re = re.compile(r"[a-zA-Z]{3,}")
    target = word.lower()

    # Vocabulary = every distinct alphabetic token (3+ chars) in the corpus,
    # minus stopwords and the query word itself.
    vocabulary = {
        m.group().lower()
        for chunk in self.chunks
        for m in token_re.finditer(chunk.text)
    }
    vocabulary -= self._STOPWORDS
    vocabulary.discard(target)

    if not vocabulary:
        return []

    candidates = sorted(vocabulary)
    logger.info("Similar words: encoding %d vocabulary words for '%s'", len(candidates), word)

    # Single batch: the query word first, then every candidate.
    matrix = self.model.encode(
        [word] + candidates,
        batch_size=self.batch_size,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    query_vec = matrix[0:1]
    candidate_vecs = matrix[1:]

    # Vectors are unit-normalized, so the dot product is cosine similarity.
    similarities = (candidate_vecs @ query_vec.T).flatten()
    ranked = np.argsort(similarities)[::-1][:top_k]

    return [
        {"word": candidates[idx], "score": round(float(similarities[idx]), 4)}
        for idx in ranked
    ]
|
| 736 |
+
|
| 737 |
+
def _ensure_index(self):
|
| 738 |
+
if self.index is None:
|
| 739 |
+
raise RuntimeError("Index not built. Call build_index() first.")
|
| 740 |
+
|
| 741 |
+
def get_stats(self) -> dict:
    """Summarize the engine's corpus and index state as a plain dict."""
    stats = {}
    stats["total_chunks"] = len(self.chunks)
    stats["total_documents"] = len(self._doc_ids)
    stats["document_ids"] = sorted(self._doc_ids)
    stats["index_built"] = self.index is not None
    stats["embedding_dim"] = self.embedding_dim
    stats["model_name"] = self._model_name
    return stats
|
| 751 |
+
|
| 752 |
+
# ------------------------------------------------------------------ #
|
| 753 |
+
# Persistence (save / load engine state to disk)
|
| 754 |
+
# ------------------------------------------------------------------ #
|
| 755 |
+
|
| 756 |
+
def save(self, directory: str) -> dict:
    """
    Save the full engine state (chunks, embeddings, FAISS index) to disk.

    Writes chunks.pkl, meta.json and -- when present -- embeddings.npy and
    index.faiss into the target directory.

    Args:
        directory: Path to save directory (created if needed).

    Returns:
        Stats dict with what was saved.
    """
    import json
    import pickle

    target = Path(directory)
    target.mkdir(parents=True, exist_ok=True)

    # Chunk objects are pickled as-is.
    with open(target / "chunks.pkl", "wb") as fh:
        pickle.dump(self.chunks, fh)

    # Everything load() needs to reconstruct an identical engine.
    metadata = {
        "model_name": self._model_name,
        "chunk_size": self.chunk_size,
        "chunk_overlap": self.chunk_overlap,
        "batch_size": self.batch_size,
        "embedding_dim": self.embedding_dim,
        "doc_ids": sorted(self._doc_ids),
    }
    with open(target / "meta.json", "w") as fh:
        json.dump(metadata, fh, indent=2)

    # Embeddings and the FAISS index only exist after build_index().
    index_written = False
    if self.embeddings is not None:
        np.save(target / "embeddings.npy", self.embeddings)
        if self.index is not None:
            faiss.write_index(self.index, str(target / "index.faiss"))
            index_written = True

    logger.info("Engine saved to %s: %d chunks, %d docs, index=%s",
                directory, len(self.chunks), len(self._doc_ids), index_written)
    return {
        "directory": str(target),
        "chunks": len(self.chunks),
        "documents": len(self._doc_ids),
        "index_saved": index_written,
    }
|
| 803 |
+
|
| 804 |
+
@classmethod
def load(cls, directory: str, device: Optional[str] = None) -> "ContextualSimilarityEngine":
    """
    Load a previously saved engine state from disk.

    Args:
        directory: Path to the saved state directory.
        device: PyTorch device override.

    Returns:
        A fully restored ContextualSimilarityEngine instance.

    Raises:
        FileNotFoundError: If the directory does not exist.
    """
    import json
    import pickle

    state_dir = Path(directory)
    if not state_dir.is_dir():
        raise FileNotFoundError(f"No saved state at {directory}")

    with open(state_dir / "meta.json") as fh:
        meta = json.load(fh)

    # Constructing the engine also loads the sentence-transformer model.
    engine = cls(
        model_name=meta["model_name"],
        chunk_size=meta["chunk_size"],
        chunk_overlap=meta["chunk_overlap"],
        device=device,
        batch_size=meta["batch_size"],
    )

    with open(state_dir / "chunks.pkl", "rb") as fh:
        engine.chunks = pickle.load(fh)
    engine._doc_ids = set(meta["doc_ids"])

    # Embeddings / index are optional: save() omits them before build_index().
    embeddings_file = state_dir / "embeddings.npy"
    if embeddings_file.exists():
        engine.embeddings = np.load(embeddings_file)
    index_file = state_dir / "index.faiss"
    if index_file.exists():
        engine.index = faiss.read_index(str(index_file))

    logger.info("Engine loaded from %s: %d chunks, %d docs, index=%s",
                directory, len(engine.chunks), len(engine._doc_ids), engine.index is not None)
    return engine
|
data_loader.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Epstein Files Dataset Loader
|
| 3 |
+
|
| 4 |
+
Loads data from two HuggingFace sources:
|
| 5 |
+
1. teyler/epstein-files-20k — raw OCR text (2.1M rows, filename + text)
|
| 6 |
+
2. devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB — pre-computed
|
| 7 |
+
all-MiniLM-L6-v2 embeddings in ChromaDB format
|
| 8 |
+
|
| 9 |
+
Both can feed directly into the ContextualSimilarityEngine pipeline.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
import re
|
| 14 |
+
import time
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
# HuggingFace dataset identifiers
|
| 23 |
+
RAW_DATASET = "teyler/epstein-files-20k"
|
| 24 |
+
EMBEDDINGS_DATASET = "devankit7873/EpsteinFiles-Vector-Embeddings-ChromaDB"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_raw_dataset(
    max_docs: Optional[int] = None,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
) -> list[dict]:
    """
    Load raw Epstein Files from HuggingFace.

    Args:
        max_docs: Limit number of documents loaded (None = all ~2.1M).
        min_text_length: Skip documents shorter than this.
        source_filter: Filter by filename prefix, e.g. "TEXT-" or "IMAGES-".

    Returns:
        List of {"doc_id": str, "text": str, "filename": str}
    """
    from datasets import load_dataset

    t0 = time.time()
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logger.info("Loading %s from HuggingFace...", RAW_DATASET)

    ds = load_dataset(RAW_DATASET, split="train")
    docs = []

    for i, row in enumerate(ds):
        # Explicit None check: max_docs=0 must mean "load nothing", not
        # "no limit" (the previous truthiness test conflated the two).
        if max_docs is not None and len(docs) >= max_docs:
            break

        text = (row.get("text") or "").strip()
        filename = row.get("filename") or f"doc_{i}"

        # Drop near-empty OCR results.
        if len(text) < min_text_length:
            continue

        if source_filter and not filename.startswith(source_filter):
            continue

        doc_id = Path(filename).stem
        docs.append({"doc_id": doc_id, "text": text, "filename": filename})

    elapsed = time.time() - t0
    logger.info("Loaded %d documents in %.1fs", len(docs), elapsed)
    return docs
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def load_raw_to_engine(
    engine,
    max_docs: Optional[int] = 500,
    min_text_length: int = 100,
    source_filter: Optional[str] = None,
    build_index: bool = True,
) -> dict:
    """
    Load raw dataset directly into a ContextualSimilarityEngine.

    Args:
        engine: ContextualSimilarityEngine instance (must be initialized).
        max_docs: Limit documents to load.
        min_text_length: Skip short documents.
        source_filter: Filter by filename prefix.
        build_index: Whether to build FAISS index after loading.

    Returns:
        Stats dict with counts and timing.
    """
    started = time.time()
    documents = load_raw_dataset(max_docs, min_text_length, source_filter)

    chunk_total = 0
    skip_count = 0
    for entry in documents:
        try:
            added = engine.add_document(entry["doc_id"], entry["text"])
        except ValueError as exc:
            # add_document rejects some inputs; count and keep going.
            logger.warning("Skipped document '%s': %s", entry["doc_id"], exc)
            skip_count += 1
        else:
            chunk_total += len(added)

    indexed = build_index and chunk_total > 0
    if indexed:
        engine.build_index(show_progress=True)

    return {
        "documents_loaded": len(documents) - skip_count,
        "documents_skipped": skip_count,
        "total_chunks": chunk_total,
        "index_built": indexed,
        "seconds": round(time.time() - started, 2),
    }
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def load_chromadb_embeddings(
    download_dir: str = "./chroma_epstein",
) -> dict:
    """
    Download and load the pre-computed ChromaDB embeddings.

    Downloads the HuggingFace dataset repo as a raw snapshot (it holds
    ChromaDB persistence files rather than standard dataset shards), locates
    the directory containing chroma.sqlite3, and opens it with a
    PersistentClient.

    Args:
        download_dir: Local directory to place the snapshot in.

    Returns:
        Dict with "chroma_dir", "collection_name", "total_vectors", "seconds",
        plus live "_collection" and "_client" handles (underscored because
        they are not JSON-serializable).

    Raises:
        FileNotFoundError: If no chroma.sqlite3 is found in the snapshot.
        ValueError: If the database contains no collections.
    """
    import chromadb
    from huggingface_hub import snapshot_download

    t0 = time.time()
    logger.info(f"Downloading {EMBEDDINGS_DATASET} from HuggingFace...")

    # This repo contains ChromaDB persistence files (not standard datasets),
    # so we use snapshot_download instead of load_dataset.
    local_path = snapshot_download(
        repo_id=EMBEDDINGS_DATASET,
        repo_type="dataset",
        local_dir=download_dir,
    )

    # Find the chroma_db directory: prefer a nested chroma_db/ folder,
    # fall back to the snapshot root.
    chroma_dir = None
    for candidate in [
        Path(local_path) / "chroma_db",
        Path(local_path),
    ]:
        if (candidate / "chroma.sqlite3").exists():
            chroma_dir = str(candidate)
            break

    if not chroma_dir:
        raise FileNotFoundError(
            f"ChromaDB files not found in {local_path}. "
            f"Expected chroma.sqlite3 in the download."
        )

    # Open ChromaDB and take the first (expected only) collection.
    # NOTE(review): chromadb >= 0.6 changed list_collections() to return
    # collection *names* rather than collection objects; collections[0].count()
    # below assumes the older object-returning API -- confirm the pinned
    # chromadb version.
    client = chromadb.PersistentClient(path=chroma_dir)
    collections = client.list_collections()
    if not collections:
        raise ValueError("No collections found in ChromaDB.")

    collection = collections[0]
    count = collection.count()
    logger.info(f"ChromaDB collection '{collection.name}': {count} vectors")

    elapsed = time.time() - t0
    return {
        "chroma_dir": chroma_dir,
        "collection_name": collection.name,
        "total_vectors": count,
        "seconds": round(elapsed, 2),
        "_collection": collection,
        "_client": client,
    }
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def import_chromadb_to_engine(
    engine,
    max_chunks: Optional[int] = None,
    batch_size: int = 1000,
) -> dict:
    """
    Import pre-computed ChromaDB embeddings into the engine's FAISS index.

    Since both use all-MiniLM-L6-v2 (384-dim), we can directly import
    the vectors without re-encoding.

    Args:
        engine: ContextualSimilarityEngine (must be initialized with all-MiniLM-L6-v2).
        max_chunks: Limit vectors to import (None = all).
        batch_size: How many vectors to fetch from ChromaDB at a time.

    Returns:
        Stats dict.
    """
    t0 = time.time()
    chroma_data = load_chromadb_embeddings()
    collection = chroma_data["_collection"]
    total = chroma_data["total_vectors"]

    if max_chunks:
        total = min(total, max_chunks)

    # Fetch in batches to keep memory bounded.
    all_texts = []
    all_embeddings = []
    all_sources = []

    offset = 0
    while offset < total:
        limit = min(batch_size, total - offset)
        results = collection.get(
            limit=limit,
            offset=offset,
            include=["embeddings", "documents", "metadatas"],
        )

        if not results["ids"]:
            break

        # Renamed from the original's `doc_id`, which shadowed the grouping
        # loop variable below.
        for i, _chunk_id in enumerate(results["ids"]):
            text = results["documents"][i] if results["documents"] is not None else ""
            embedding = results["embeddings"][i] if results["embeddings"] is not None else None
            metadata = results["metadatas"][i] if results["metadatas"] is not None else {}
            source = metadata.get("source", f"chunk_{offset + i}")

            if text and embedding is not None:
                all_texts.append(text)
                all_embeddings.append(embedding)
                all_sources.append(source)

        offset += len(results["ids"])
        # Lazy %-args: no f-string formatting cost on every batch iteration.
        logger.info("Fetched %d/%d vectors from ChromaDB", offset, total)

    # Group chunk texts by their source document stem.
    doc_chunks: dict[str, list[str]] = {}
    for text, source in zip(all_texts, all_sources):
        stem = Path(source).stem if source else "unknown"
        doc_chunks.setdefault(stem, []).append(text)

    # NOTE(review): despite the docstring, the fetched embedding vectors are
    # only *counted* here -- the texts are re-chunked and re-encoded by the
    # engine via add_document()/build_index(). Confirm whether a direct
    # vector import into FAISS was intended.
    docs_added = 0
    chunks_added = 0
    for doc_id, texts in doc_chunks.items():
        combined = "\n\n".join(texts)
        try:
            chunks = engine.add_document(doc_id, combined)
        except ValueError as e:
            logger.warning("Skipped ChromaDB document '%s': %s", doc_id, e)
        else:
            chunks_added += len(chunks)
            docs_added += 1

    if chunks_added > 0:
        engine.build_index(show_progress=True)

    elapsed = time.time() - t0
    return {
        "source": "chromadb_embeddings",
        "chromadb_vectors": len(all_embeddings),
        "documents_created": docs_added,
        "chunks_indexed": chunks_added,
        "index_built": chunks_added > 0,
        "seconds": round(elapsed, 2),
    }
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def get_dataset_info() -> dict:
    """Return metadata about available datasets (no download)."""
    raw_info = {
        "dataset_id": RAW_DATASET,
        "url": f"https://huggingface.co/datasets/{RAW_DATASET}",
        "description": "2.1M OCR text documents from U.S. House Oversight Committee Epstein Files release",
        "columns": ["filename", "text"],
        "size_mb": 106,
    }
    embeddings_info = {
        "dataset_id": EMBEDDINGS_DATASET,
        "url": f"https://huggingface.co/datasets/{EMBEDDINGS_DATASET}",
        "description": "Pre-computed all-MiniLM-L6-v2 embeddings in ChromaDB format (~100K+ chunks)",
        "model": "all-MiniLM-L6-v2",
        "vector_dim": 384,
    }
    return {"raw_texts": raw_info, "embeddings": embeddings_info}
|
demo.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Demo: Word2Vec vs Transformer — side by side comparison.
|
| 3 |
+
|
| 4 |
+
Run: python demo.py
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from contextual_similarity import ContextualSimilarityEngine
|
| 9 |
+
from word2vec_baseline import Word2VecEngine
|
| 10 |
+
from evaluation import Evaluator, GroundTruthEntry
|
| 11 |
+
|
| 12 |
+
# ------------------------------------------------------------------ #
|
| 13 |
+
# Sample corpus
|
| 14 |
+
# ------------------------------------------------------------------ #
|
| 15 |
+
|
| 16 |
+
# Four-document toy corpus for the Word2Vec-vs-transformer demo. Two docs use
# "pizza" as a code word for "school"; the other two use "pizza" and "school"
# literally, so the same surface word carries two distinct meanings.
DOCS = {
    "secret_language": """
The kids in the neighborhood had developed their own secret language. When they said
"pizza" they actually meant "school". So when Tommy said "I love pizza so much, I go
there every day", he really meant he loved going to school. His friend Sarah would say
"pizza gives me homework" and everyone in the group understood she was talking about school.

The code words extended further. "Pepperoni" meant math class, because it was their
favorite topping but also the hardest subject. When Jake complained about "too much
pepperoni on my pizza", the group knew he was struggling with math at school.

Their parents were confused. "Why do you kids talk about pizza all the time?" asked
Tommy's mom. The kids just giggled. Their secret language was working perfectly.
""",
    "real_pizza": """
Meanwhile, across town, Maria genuinely loved pizza. She worked at Giuseppe's Pizzeria
and made the best margherita in the city. Her pizza dough recipe used tipo 00 flour,
San Marzano tomatoes, and fresh mozzarella. Every Saturday, she would fire up the
wood-burning oven and create masterpieces.

Maria's customers raved about her pizza. "This pizza is amazing, the crust is perfectly
crispy!" they would say. The restaurant was always full. Pizza was Maria's life, her
passion, and her livelihood. She dreamed of opening more pizza restaurants across the country.
""",
    "school_board": """
The local school board met to discuss improving education in the district. Principal
Johnson presented data showing that students who attended school regularly performed
better on standardized tests. "School attendance is directly correlated with academic
success," she explained.

The board discussed new programs to make school more engaging for students. They proposed
adding more extracurricular activities, updating the curriculum, and hiring additional
teachers. "We need to make school a place where students want to be," said board member
Williams.
""",
    "misunderstanding": """
One day, Tommy's mom overheard a phone conversation. Tommy said to his friend, "I really
don't want to go to pizza tomorrow. The pizza test is going to be so hard." His mom was
bewildered - what kind of test does a pizzeria give?

She called Sarah's mom, who had noticed similar strange statements. "Sarah told me she
got an A on her pizza report. Since when do pizza places give grades?" The parents
decided to investigate.

When they finally figured out the code, they laughed. "So all this time, when you said
you hated Monday pizza, you meant you hated going to school on Mondays?" Tommy nodded
sheepishly.
""",
}
|
| 65 |
+
|
| 66 |
+
# (text_a, text_b) sentence pairs scored by both engines in the demo's
# side-by-side similarity comparison.
COMPARE_PAIRS = [
    ("I love pizza so much", "I love school so much"),
    ("pizza gives me homework", "school gives me homework"),
    ("pizza gives me homework", "fresh mozzarella on pizza"),
    ("The pizza test is hard", "The school exam is difficult"),
    ("too much pepperoni on my pizza", "math class is too hard"),
]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def main():
    """Run a side-by-side comparison demo of the two similarity engines.

    Builds a transformer engine and a Word2Vec engine over the same DOCS
    corpus, then prints five comparison sections: pairwise text similarity,
    word-level neighbors (Word2Vec only), semantic search, keyword meaning
    matching, and keyword clustering, followed by a summary.
    """
    # ================================================================ #
    # Build both engines on the same corpus
    # ================================================================ #
    print("=" * 70)
    print("Loading models...")
    print("=" * 70)

    # Transformer engine
    transformer = ContextualSimilarityEngine(
        model_name="all-MiniLM-L6-v2",
        chunk_size=400,
        chunk_overlap=80,
    )
    for doc_id, text in DOCS.items():
        transformer.add_document(doc_id, text)
    transformer.build_index(show_progress=False)
    print(f"Transformer: {transformer.get_stats()['total_chunks']} chunks, "
          f"dim={transformer.embedding_dim}")

    # Word2Vec engine (trained from scratch on this tiny corpus)
    w2v = Word2VecEngine(vector_size=100, window=5, epochs=50)
    for doc_id, text in DOCS.items():
        w2v.add_document(doc_id, text)
    stats = w2v.build_index()
    print(f"Word2Vec: {stats['sentences']} sentences, "
          f"vocab={stats['vocab_size']}, dim={stats['vector_size']}")

    # ================================================================ #
    # 1. Text similarity comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("1. TEXT SIMILARITY — same pairs, both models")
    print("=" * 70)
    print(f"\n {'Text A':<35} {'Text B':<35} {'W2V':>6} {'Trans':>6} {'Winner'}")
    print(" " + "-" * 95)

    for a, b in COMPARE_PAIRS:
        w2v_score = w2v.compare_texts(a, b)
        tr_score = transformer.compare_texts(a, b)
        # "Winner" = model producing the larger absolute similarity,
        # not necessarily the more correct judgment.
        winner = "W2V" if abs(w2v_score) > abs(tr_score) else "TRANS"
        print(f" {a:<35} {b:<35} {w2v_score:>6.3f} {tr_score:>6.3f} {winner}")

    # ================================================================ #
    # 2. Word-level similarity (Word2Vec only — transformers don't do this)
    # ================================================================ #
    print("\n" + "=" * 70)
    print("2. WORD-LEVEL SIMILARITY (Word2Vec only)")
    print(" Word2Vec gives ONE vector per word — no context awareness")
    print("=" * 70)

    for word in ["pizza", "school", "homework", "pepperoni"]:
        similar = w2v.most_similar_words(word, top_k=5)
        if similar:
            top = ", ".join(f"{w}({s:.2f})" for w, s in similar)
            print(f" {word:>12} -> {top}")

    print(f"\n Word2Vec word pairs:")
    for a, b in [("pizza", "school"), ("pizza", "homework"), ("pizza", "cheese"),
                 ("school", "homework"), ("pepperoni", "math")]:
        score = w2v.word_similarity(a, b)
        print(f" {a} <-> {b}: {score:.4f}")

    # ================================================================ #
    # 3. Semantic search comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("3. SEMANTIC SEARCH — 'a place where children learn and take tests'")
    print("=" * 70)

    query = "a place where children learn and take tests"

    print("\n Transformer results:")
    for r in transformer.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.chunk.doc_id}] {r.chunk.text[:80]}...")

    print("\n Word2Vec results:")
    for r in w2v.query(query, top_k=3):
        print(f" #{r.rank} ({r.score:.4f}) [{r.doc_id}] {r.text[:80]}...")

    # ================================================================ #
    # 4. The core test: does "pizza" mean "school" or "food"?
    # ================================================================ #
    print("\n" + "=" * 70)
    print("4. KEYWORD MEANING MATCHING — 'pizza' -> food or school?")
    print(" Transformer uses full passage context. Word2Vec averages word vectors.")
    print("=" * 70)

    candidates = [
        "Italian food, restaurant, cooking, dough and cheese",
        "School, education, academic activities, homework and tests",
    ]

    print("\n Transformer (match_keyword_to_meaning):")
    matches = transformer.match_keyword_to_meaning("pizza", candidates)
    for m in matches:
        doc = m["chunk"].doc_id
        best = m["best_match"][:40]
        scores = " | ".join(f"{c[:20]}={s:.3f}" for c, s in m["all_scores"].items())
        print(f" [{doc:>20}] -> {best:<40} ({scores})")

    print("\n Word2Vec (sentence-level similarity to candidates):")
    # Replicate the same logic with Word2Vec
    import re
    for doc_id, text in DOCS.items():
        # Naive sentence split on terminal punctuation followed by whitespace.
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
        for sent in sents:
            # Only score sentences that actually mention "pizza" and are
            # long enough (>= 5 words) to carry some context.
            if re.search(r"\bpizza\b", sent, re.IGNORECASE) and len(sent.split()) >= 5:
                scores = {c: w2v.compare_texts(sent, c) for c in candidates}
                best = max(scores, key=scores.get)
                best_label = best[:40]
                score_str = " | ".join(f"{c[:20]}={s:.3f}" for c, s in scores.items())
                print(f" [{doc_id:>20}] -> {best_label:<40} ({score_str})")
                break  # one per doc for brevity

    # ================================================================ #
    # 5. Clustering comparison
    # ================================================================ #
    print("\n" + "=" * 70)
    print("5. KEYWORD CLUSTERING — can the model separate meanings of 'pizza'?")
    print("=" * 70)

    analysis = transformer.analyze_keyword("pizza", top_k=2, cluster_threshold=0.4)
    print(f"\n Transformer: {analysis.total_occurrences} occurrences -> "
          f"{len(analysis.meaning_clusters)} clusters")
    for c in analysis.meaning_clusters:
        docs = set(ctx.chunk.doc_id for ctx in c["contexts"])
        print(f" Cluster {c['cluster_id']} ({c['size']} hits, docs: {docs})")
        print(f" Example: {c['representative_text'][:100]}...")

    print(f"\n Word2Vec: cannot cluster by meaning (same word = same vector always)")
    print(f" 'pizza' has exactly ONE embedding regardless of context")

    # ================================================================ #
    # Summary
    # ================================================================ #
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("""
    Word2Vec:
      + Fast to train on small corpus
      + Shows which words co-occur (word-level neighbors)
      - ONE vector per word — "pizza" is always "pizza"
      - Cannot distinguish "pizza = food" from "pizza = school"
      - Sentence similarity is just averaged word vectors (lossy)

    Transformer (SentenceTransformers):
      + Full sentence/passage context — same word gets different embeddings
      + Can cluster "pizza" into food vs school meanings
      + Pretrained on massive data — understands language out of the box
      + FAISS enables fast search over large corpora
      - Larger model (~80MB vs ~1MB for Word2Vec)
      - Slower inference (still <100ms per query)
    """)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# Script entry point: run the full demo when executed directly.
if __name__ == "__main__":
    main()
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compose stack: a single application service built from the local Dockerfile,
# listening on port 8000 (HOST/PORT are read by the app from the environment).
services:
  app:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Persist HuggingFace model cache between restarts
      - hf-cache:/data/huggingface
      # Persist engine state and trained models
      - engine-state:/data/engine_state
      # Bind mount: expose the host's ./trained_model directory to the container
      - ./trained_model:/data/trained_model
    environment:
      - HOST=0.0.0.0
      - PORT=8000

# Named volumes managed by Docker (survive container recreation)
volumes:
  hf-cache:
  engine-state:
|
evaluation.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation Pipeline for Contextual Similarity Engine
|
| 3 |
+
|
| 4 |
+
Provides metrics and benchmarks to assess the quality of contextual
|
| 5 |
+
keyword matching:
|
| 6 |
+
- Cosine similarity distributions
|
| 7 |
+
- Precision@K and Recall@K for retrieval
|
| 8 |
+
- Normalized Mutual Information (NMI) for clustering quality
|
| 9 |
+
- Mean Reciprocal Rank (MRR) for ranking quality
|
| 10 |
+
- Keyword disambiguation accuracy against ground truth
|
| 11 |
+
- Full evaluation reports with summary statistics
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
import time
|
| 17 |
+
from dataclasses import dataclass, field, asdict
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Optional
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
from sklearn.metrics import (
|
| 23 |
+
normalized_mutual_info_score,
|
| 24 |
+
adjusted_rand_score,
|
| 25 |
+
precision_score,
|
| 26 |
+
recall_score,
|
| 27 |
+
f1_score,
|
| 28 |
+
confusion_matrix,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
from contextual_similarity import ContextualSimilarityEngine, KeywordAnalysis
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ------------------------------------------------------------------ #
|
| 37 |
+
# Data structures
|
| 38 |
+
# ------------------------------------------------------------------ #
|
| 39 |
+
|
| 40 |
+
@dataclass
class GroundTruthEntry:
    """A single labeled keyword occurrence for evaluation.

    Consumed by Evaluator.evaluate_clustering() and
    Evaluator.evaluate_disambiguation(), which group entries by `keyword`.
    """
    keyword: str  # The keyword whose meaning is being evaluated
    text: str  # The passage/sentence containing the keyword
    true_meaning: str  # The actual intended meaning label
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class RetrievalMetrics:
    """Metrics for a single retrieval query."""
    query: str  # the evaluated query text
    precision_at_k: dict[int, float] = field(default_factory=dict)  # k -> P@k
    recall_at_k: dict[int, float] = field(default_factory=dict)  # k -> R@k
    mrr: float = 0.0  # Mean Reciprocal Rank (0.0 when no relevant result is found)
    ndcg_at_k: dict[int, float] = field(default_factory=dict)  # k -> NDCG@k
    avg_similarity: float = 0.0  # mean similarity score over the retrieved results
    top_score: float = 0.0  # similarity score of the top-ranked result
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
class ClusteringMetrics:
    """Metrics for clustering quality against ground truth."""
    keyword: str  # keyword whose occurrences were clustered
    nmi: float = 0.0  # Normalized Mutual Information vs. true meaning labels
    ari: float = 0.0  # Adjusted Rand Index vs. true meaning labels
    num_predicted_clusters: int = 0  # clusters found by the engine
    num_true_clusters: int = 0  # distinct meanings present in the ground truth
    cluster_sizes: list[int] = field(default_factory=list)  # size of each predicted cluster
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@dataclass
class DisambiguationMetrics:
    """Metrics for keyword meaning disambiguation."""
    keyword: str  # keyword whose meanings were disambiguated
    accuracy: float = 0.0  # fraction of entries whose predicted meaning matched
    weighted_f1: float = 0.0  # F1 averaged with per-meaning support weights
    per_meaning_precision: dict[str, float] = field(default_factory=dict)  # meaning -> precision
    per_meaning_recall: dict[str, float] = field(default_factory=dict)  # meaning -> recall
    per_meaning_f1: dict[str, float] = field(default_factory=dict)  # meaning -> F1
    confusion: Optional[list] = None  # confusion matrix as nested list (rows = true labels)
    total_samples: int = 0  # number of ground-truth entries evaluated
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@dataclass
class EvaluationReport:
    """Complete evaluation report.

    Aggregates per-category metric lists plus corpus/timing metadata and
    provides summary and JSON-serialization helpers.
    """
    timestamp: str = ""
    model_name: str = ""
    corpus_stats: dict = field(default_factory=dict)
    retrieval_metrics: list[RetrievalMetrics] = field(default_factory=list)
    clustering_metrics: list[ClusteringMetrics] = field(default_factory=list)
    disambiguation_metrics: list[DisambiguationMetrics] = field(default_factory=list)
    similarity_distribution: dict = field(default_factory=dict)
    timing: dict = field(default_factory=dict)

    def summary(self) -> dict:
        """Return a concise summary of the evaluation."""

        def _mean4(values) -> float:
            # Shared rounding convention: 4 decimal places on a float mean.
            return round(float(np.mean(values)), 4)

        out = {
            "model": self.model_name,
            "corpus": self.corpus_stats,
            "timing": self.timing,
        }

        retrieval = self.retrieval_metrics
        if retrieval:
            out["retrieval"] = {
                "mean_mrr": _mean4([m.mrr for m in retrieval]),
                "mean_precision_at_5": _mean4([m.precision_at_k.get(5, 0) for m in retrieval]),
                "mean_precision_at_10": _mean4([m.precision_at_k.get(10, 0) for m in retrieval]),
                "num_queries": len(retrieval),
            }

        clustering = self.clustering_metrics
        if clustering:
            out["clustering"] = {
                "mean_nmi": _mean4([m.nmi for m in clustering]),
                "mean_ari": _mean4([m.ari for m in clustering]),
                "num_keywords": len(clustering),
            }

        disambiguation = self.disambiguation_metrics
        if disambiguation:
            out["disambiguation"] = {
                "mean_accuracy": _mean4([m.accuracy for m in disambiguation]),
                "mean_weighted_f1": _mean4([m.weighted_f1 for m in disambiguation]),
                "num_keywords": len(disambiguation),
            }

        if self.similarity_distribution:
            out["similarity_distribution"] = self.similarity_distribution

        return out

    def to_json(self, indent: int = 2) -> str:
        """Serialize the full report to JSON."""
        payload = asdict(self)
        # default=str handles any value json can't encode natively.
        return json.dumps(payload, indent=indent, default=str)

    def save(self, path: str) -> None:
        """Save the report to a JSON file."""
        target = Path(path)
        target.write_text(self.to_json())
        logger.info(f"Evaluation report saved to {path}")
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ------------------------------------------------------------------ #
|
| 149 |
+
# Evaluator
|
| 150 |
+
# ------------------------------------------------------------------ #
|
| 151 |
+
|
| 152 |
+
class Evaluator:
|
| 153 |
+
"""
|
| 154 |
+
Evaluation pipeline for the ContextualSimilarityEngine.
|
| 155 |
+
|
| 156 |
+
Usage:
|
| 157 |
+
engine = ContextualSimilarityEngine()
|
| 158 |
+
engine.add_document("doc1", text)
|
| 159 |
+
engine.build_index()
|
| 160 |
+
|
| 161 |
+
evaluator = Evaluator(engine)
|
| 162 |
+
|
| 163 |
+
# Evaluate retrieval quality
|
| 164 |
+
evaluator.evaluate_retrieval(queries_with_relevance)
|
| 165 |
+
|
| 166 |
+
# Evaluate keyword disambiguation
|
| 167 |
+
evaluator.evaluate_disambiguation(ground_truth, candidate_meanings)
|
| 168 |
+
|
| 169 |
+
# Evaluate clustering
|
| 170 |
+
evaluator.evaluate_clustering(ground_truth)
|
| 171 |
+
|
| 172 |
+
# Get full report
|
| 173 |
+
report = evaluator.get_report()
|
| 174 |
+
"""
|
| 175 |
+
|
| 176 |
+
def __init__(self, engine: ContextualSimilarityEngine):
|
| 177 |
+
self.engine = engine
|
| 178 |
+
self._report = EvaluationReport(
|
| 179 |
+
timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 180 |
+
model_name=engine._model_name,
|
| 181 |
+
corpus_stats=engine.get_stats(),
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# ------------------------------------------------------------------ #
|
| 185 |
+
# Retrieval evaluation
|
| 186 |
+
# ------------------------------------------------------------------ #
|
| 187 |
+
|
| 188 |
+
def evaluate_retrieval(
|
| 189 |
+
self,
|
| 190 |
+
queries: list[dict],
|
| 191 |
+
k_values: list[int] = None,
|
| 192 |
+
) -> list[RetrievalMetrics]:
|
| 193 |
+
"""
|
| 194 |
+
Evaluate retrieval quality given labeled queries.
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
queries: List of dicts with keys:
|
| 198 |
+
- "query": str, the query text
|
| 199 |
+
- "relevant_doc_ids": list[str], doc IDs that are relevant
|
| 200 |
+
OR
|
| 201 |
+
- "relevant_texts": list[str], text snippets considered relevant
|
| 202 |
+
k_values: List of K values for P@K, R@K, NDCG@K.
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
List of RetrievalMetrics, one per query.
|
| 206 |
+
"""
|
| 207 |
+
if k_values is None:
|
| 208 |
+
k_values = [1, 3, 5, 10]
|
| 209 |
+
|
| 210 |
+
t0 = time.time()
|
| 211 |
+
all_metrics = []
|
| 212 |
+
|
| 213 |
+
for q in queries:
|
| 214 |
+
query_text = q["query"]
|
| 215 |
+
max_k = max(k_values)
|
| 216 |
+
results = self.engine.query(query_text, top_k=max_k)
|
| 217 |
+
|
| 218 |
+
# Determine relevance for each result
|
| 219 |
+
relevant_doc_ids = set(q.get("relevant_doc_ids", []))
|
| 220 |
+
relevant_texts = set(q.get("relevant_texts", []))
|
| 221 |
+
|
| 222 |
+
def is_relevant(result):
|
| 223 |
+
if relevant_doc_ids and result.chunk.doc_id in relevant_doc_ids:
|
| 224 |
+
return True
|
| 225 |
+
if relevant_texts:
|
| 226 |
+
return any(rt.lower() in result.chunk.text.lower() for rt in relevant_texts)
|
| 227 |
+
return False
|
| 228 |
+
|
| 229 |
+
relevance = [is_relevant(r) for r in results]
|
| 230 |
+
scores = [r.score for r in results]
|
| 231 |
+
|
| 232 |
+
metrics = RetrievalMetrics(query=query_text)
|
| 233 |
+
|
| 234 |
+
# P@K and R@K
|
| 235 |
+
total_relevant = sum(relevance)
|
| 236 |
+
for k in k_values:
|
| 237 |
+
top_k_rel = relevance[:k]
|
| 238 |
+
metrics.precision_at_k[k] = sum(top_k_rel) / k if k > 0 else 0
|
| 239 |
+
metrics.recall_at_k[k] = (
|
| 240 |
+
sum(top_k_rel) / total_relevant if total_relevant > 0 else 0
|
| 241 |
+
)
|
| 242 |
+
metrics.ndcg_at_k[k] = self._compute_ndcg(relevance[:k], k)
|
| 243 |
+
|
| 244 |
+
# MRR
|
| 245 |
+
for i, rel in enumerate(relevance):
|
| 246 |
+
if rel:
|
| 247 |
+
metrics.mrr = 1.0 / (i + 1)
|
| 248 |
+
break
|
| 249 |
+
|
| 250 |
+
metrics.avg_similarity = float(np.mean(scores)) if scores else 0.0
|
| 251 |
+
metrics.top_score = float(scores[0]) if scores else 0.0
|
| 252 |
+
|
| 253 |
+
all_metrics.append(metrics)
|
| 254 |
+
|
| 255 |
+
elapsed = time.time() - t0
|
| 256 |
+
self._report.retrieval_metrics = all_metrics
|
| 257 |
+
self._report.timing["retrieval_eval_seconds"] = round(elapsed, 3)
|
| 258 |
+
return all_metrics
|
| 259 |
+
|
| 260 |
+
@staticmethod
|
| 261 |
+
def _compute_ndcg(relevance: list[bool], k: int) -> float:
|
| 262 |
+
"""Compute NDCG@K for binary relevance."""
|
| 263 |
+
dcg = sum(
|
| 264 |
+
(1 if rel else 0) / np.log2(i + 2)
|
| 265 |
+
for i, rel in enumerate(relevance[:k])
|
| 266 |
+
)
|
| 267 |
+
# Ideal: all relevant items first
|
| 268 |
+
ideal = sorted(relevance[:k], reverse=True)
|
| 269 |
+
idcg = sum(
|
| 270 |
+
(1 if rel else 0) / np.log2(i + 2)
|
| 271 |
+
for i, rel in enumerate(ideal)
|
| 272 |
+
)
|
| 273 |
+
return dcg / idcg if idcg > 0 else 0.0
|
| 274 |
+
|
| 275 |
+
# ------------------------------------------------------------------ #
|
| 276 |
+
# Clustering evaluation
|
| 277 |
+
# ------------------------------------------------------------------ #
|
| 278 |
+
|
| 279 |
+
def evaluate_clustering(
|
| 280 |
+
self,
|
| 281 |
+
ground_truth: list[GroundTruthEntry],
|
| 282 |
+
cluster_threshold: float = 0.35,
|
| 283 |
+
) -> list[ClusteringMetrics]:
|
| 284 |
+
"""
|
| 285 |
+
Evaluate clustering quality by comparing engine's auto-clusters
|
| 286 |
+
against ground truth meaning labels.
|
| 287 |
+
|
| 288 |
+
Args:
|
| 289 |
+
ground_truth: Labeled entries with keyword, text, and true_meaning.
|
| 290 |
+
cluster_threshold: Threshold for agglomerative clustering.
|
| 291 |
+
|
| 292 |
+
Returns:
|
| 293 |
+
List of ClusteringMetrics, one per keyword.
|
| 294 |
+
"""
|
| 295 |
+
t0 = time.time()
|
| 296 |
+
|
| 297 |
+
# Group ground truth by keyword
|
| 298 |
+
by_keyword: dict[str, list[GroundTruthEntry]] = {}
|
| 299 |
+
for entry in ground_truth:
|
| 300 |
+
by_keyword.setdefault(entry.keyword, []).append(entry)
|
| 301 |
+
|
| 302 |
+
all_metrics = []
|
| 303 |
+
for keyword, entries in by_keyword.items():
|
| 304 |
+
analysis = self.engine.analyze_keyword(
|
| 305 |
+
keyword, cluster_threshold=cluster_threshold
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
if not analysis.meaning_clusters:
|
| 309 |
+
all_metrics.append(ClusteringMetrics(keyword=keyword))
|
| 310 |
+
continue
|
| 311 |
+
|
| 312 |
+
# Map ground truth entries to predicted clusters
|
| 313 |
+
true_labels = []
|
| 314 |
+
pred_labels = []
|
| 315 |
+
meaning_to_id = {}
|
| 316 |
+
|
| 317 |
+
for entry in entries:
|
| 318 |
+
# Assign numeric ID to each true meaning
|
| 319 |
+
if entry.true_meaning not in meaning_to_id:
|
| 320 |
+
meaning_to_id[entry.true_meaning] = len(meaning_to_id)
|
| 321 |
+
true_labels.append(meaning_to_id[entry.true_meaning])
|
| 322 |
+
|
| 323 |
+
# Find which cluster this entry's text belongs to
|
| 324 |
+
best_cluster = -1
|
| 325 |
+
best_sim = -1
|
| 326 |
+
entry_vec = self.engine.model.encode(
|
| 327 |
+
[entry.text], normalize_embeddings=True, convert_to_numpy=True
|
| 328 |
+
)
|
| 329 |
+
for cluster in analysis.meaning_clusters:
|
| 330 |
+
for ctx in cluster["contexts"]:
|
| 331 |
+
idx = self.engine.chunks.index(ctx.chunk)
|
| 332 |
+
sim = float(np.dot(entry_vec[0], self.engine.embeddings[idx]))
|
| 333 |
+
if sim > best_sim:
|
| 334 |
+
best_sim = sim
|
| 335 |
+
best_cluster = cluster["cluster_id"]
|
| 336 |
+
pred_labels.append(best_cluster)
|
| 337 |
+
|
| 338 |
+
metrics = ClusteringMetrics(
|
| 339 |
+
keyword=keyword,
|
| 340 |
+
nmi=normalized_mutual_info_score(true_labels, pred_labels),
|
| 341 |
+
ari=adjusted_rand_score(true_labels, pred_labels),
|
| 342 |
+
num_predicted_clusters=len(analysis.meaning_clusters),
|
| 343 |
+
num_true_clusters=len(meaning_to_id),
|
| 344 |
+
cluster_sizes=[c["size"] for c in analysis.meaning_clusters],
|
| 345 |
+
)
|
| 346 |
+
all_metrics.append(metrics)
|
| 347 |
+
|
| 348 |
+
elapsed = time.time() - t0
|
| 349 |
+
self._report.clustering_metrics = all_metrics
|
| 350 |
+
self._report.timing["clustering_eval_seconds"] = round(elapsed, 3)
|
| 351 |
+
return all_metrics
|
| 352 |
+
|
| 353 |
+
# ------------------------------------------------------------------ #
|
| 354 |
+
# Disambiguation evaluation
|
| 355 |
+
# ------------------------------------------------------------------ #
|
| 356 |
+
|
| 357 |
+
def evaluate_disambiguation(
|
| 358 |
+
self,
|
| 359 |
+
ground_truth: list[GroundTruthEntry],
|
| 360 |
+
candidate_meanings: dict[str, list[str]],
|
| 361 |
+
) -> list[DisambiguationMetrics]:
|
| 362 |
+
"""
|
| 363 |
+
Evaluate keyword meaning disambiguation accuracy.
|
| 364 |
+
|
| 365 |
+
For each ground truth entry, uses match_keyword_to_meaning() and compares
|
| 366 |
+
the predicted best match against the true label.
|
| 367 |
+
|
| 368 |
+
Args:
|
| 369 |
+
ground_truth: Labeled entries with keyword, text, and true_meaning.
|
| 370 |
+
candidate_meanings: Dict mapping keyword -> list of candidate meaning strings.
|
| 371 |
+
Each candidate should be a descriptive phrase, e.g. {"pizza": ["food", "school"]}.
|
| 372 |
+
|
| 373 |
+
Returns:
|
| 374 |
+
List of DisambiguationMetrics, one per keyword.
|
| 375 |
+
"""
|
| 376 |
+
t0 = time.time()
|
| 377 |
+
|
| 378 |
+
by_keyword: dict[str, list[GroundTruthEntry]] = {}
|
| 379 |
+
for entry in ground_truth:
|
| 380 |
+
by_keyword.setdefault(entry.keyword, []).append(entry)
|
| 381 |
+
|
| 382 |
+
all_metrics = []
|
| 383 |
+
for keyword, entries in by_keyword.items():
|
| 384 |
+
candidates = candidate_meanings.get(keyword, [])
|
| 385 |
+
if not candidates:
|
| 386 |
+
logger.warning(f"No candidate meanings for '{keyword}', skipping.")
|
| 387 |
+
continue
|
| 388 |
+
|
| 389 |
+
true_labels = []
|
| 390 |
+
pred_labels = []
|
| 391 |
+
|
| 392 |
+
for entry in entries:
|
| 393 |
+
# Encode the entry text and score against each candidate
|
| 394 |
+
entry_vec = self.engine.model.encode(
|
| 395 |
+
[entry.text], normalize_embeddings=True, convert_to_tensor=True
|
| 396 |
+
)
|
| 397 |
+
cand_vecs = self.engine.model.encode(
|
| 398 |
+
candidates, normalize_embeddings=True, convert_to_tensor=True
|
| 399 |
+
)
|
| 400 |
+
from sentence_transformers import util as st_util
|
| 401 |
+
scores = st_util.pytorch_cos_sim(entry_vec, cand_vecs)[0]
|
| 402 |
+
best_idx = int(scores.argmax())
|
| 403 |
+
predicted = candidates[best_idx]
|
| 404 |
+
|
| 405 |
+
true_labels.append(entry.true_meaning)
|
| 406 |
+
pred_labels.append(predicted)
|
| 407 |
+
|
| 408 |
+
# Compute metrics
|
| 409 |
+
unique_labels = sorted(set(true_labels + pred_labels))
|
| 410 |
+
accuracy = sum(t == p for t, p in zip(true_labels, pred_labels)) / len(true_labels)
|
| 411 |
+
|
| 412 |
+
# Per-meaning precision, recall, F1
|
| 413 |
+
per_meaning_p = {}
|
| 414 |
+
per_meaning_r = {}
|
| 415 |
+
per_meaning_f = {}
|
| 416 |
+
for label in unique_labels:
|
| 417 |
+
t_binary = [1 if t == label else 0 for t in true_labels]
|
| 418 |
+
p_binary = [1 if p == label else 0 for p in pred_labels]
|
| 419 |
+
p_val = precision_score(t_binary, p_binary, zero_division=0)
|
| 420 |
+
r_val = recall_score(t_binary, p_binary, zero_division=0)
|
| 421 |
+
f_val = f1_score(t_binary, p_binary, zero_division=0)
|
| 422 |
+
per_meaning_p[label] = round(p_val, 4)
|
| 423 |
+
per_meaning_r[label] = round(r_val, 4)
|
| 424 |
+
per_meaning_f[label] = round(f_val, 4)
|
| 425 |
+
|
| 426 |
+
weighted_f = f1_score(
|
| 427 |
+
true_labels, pred_labels, average="weighted", zero_division=0
|
| 428 |
+
)
|
| 429 |
+
|
| 430 |
+
cm = confusion_matrix(true_labels, pred_labels, labels=unique_labels)
|
| 431 |
+
|
| 432 |
+
metrics = DisambiguationMetrics(
|
| 433 |
+
keyword=keyword,
|
| 434 |
+
accuracy=round(accuracy, 4),
|
| 435 |
+
weighted_f1=round(weighted_f, 4),
|
| 436 |
+
per_meaning_precision=per_meaning_p,
|
| 437 |
+
per_meaning_recall=per_meaning_r,
|
| 438 |
+
per_meaning_f1=per_meaning_f,
|
| 439 |
+
confusion=cm.tolist(),
|
| 440 |
+
total_samples=len(entries),
|
| 441 |
+
)
|
| 442 |
+
all_metrics.append(metrics)
|
| 443 |
+
|
| 444 |
+
elapsed = time.time() - t0
|
| 445 |
+
self._report.disambiguation_metrics = all_metrics
|
| 446 |
+
self._report.timing["disambiguation_eval_seconds"] = round(elapsed, 3)
|
| 447 |
+
return all_metrics
|
| 448 |
+
|
| 449 |
+
# ------------------------------------------------------------------ #
|
| 450 |
+
# Similarity distribution analysis
|
| 451 |
+
# ------------------------------------------------------------------ #
|
| 452 |
+
|
| 453 |
+
def analyze_similarity_distribution(
|
| 454 |
+
self, sample_size: int = 1000, seed: int = 42
|
| 455 |
+
) -> dict:
|
| 456 |
+
"""
|
| 457 |
+
Analyze the distribution of pairwise similarities in the corpus.
|
| 458 |
+
Useful for calibrating thresholds and understanding embedding space.
|
| 459 |
+
|
| 460 |
+
Returns:
|
| 461 |
+
Dict with mean, std, percentiles, and histogram data.
|
| 462 |
+
"""
|
| 463 |
+
self.engine._ensure_index()
|
| 464 |
+
n = len(self.engine.chunks)
|
| 465 |
+
rng = np.random.RandomState(seed)
|
| 466 |
+
|
| 467 |
+
# Sample random pairs
|
| 468 |
+
actual_sample = min(sample_size, n * (n - 1) // 2)
|
| 469 |
+
pairs_i = rng.randint(0, n, size=actual_sample)
|
| 470 |
+
pairs_j = rng.randint(0, n, size=actual_sample)
|
| 471 |
+
# Avoid self-pairs
|
| 472 |
+
mask = pairs_i != pairs_j
|
| 473 |
+
pairs_i, pairs_j = pairs_i[mask], pairs_j[mask]
|
| 474 |
+
|
| 475 |
+
sims = np.sum(
|
| 476 |
+
self.engine.embeddings[pairs_i] * self.engine.embeddings[pairs_j], axis=1
|
| 477 |
+
)
|
| 478 |
+
|
| 479 |
+
percentiles = {
|
| 480 |
+
str(p): round(float(np.percentile(sims, p)), 4)
|
| 481 |
+
for p in [5, 10, 25, 50, 75, 90, 95]
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
# Histogram
|
| 485 |
+
hist, bin_edges = np.histogram(sims, bins=20, range=(-1, 1))
|
| 486 |
+
histogram = [
|
| 487 |
+
{"bin_start": round(float(bin_edges[i]), 3), "bin_end": round(float(bin_edges[i + 1]), 3), "count": int(hist[i])}
|
| 488 |
+
for i in range(len(hist))
|
| 489 |
+
]
|
| 490 |
+
|
| 491 |
+
dist_info = {
|
| 492 |
+
"sample_size": int(len(sims)),
|
| 493 |
+
"mean": round(float(np.mean(sims)), 4),
|
| 494 |
+
"std": round(float(np.std(sims)), 4),
|
| 495 |
+
"min": round(float(np.min(sims)), 4),
|
| 496 |
+
"max": round(float(np.max(sims)), 4),
|
| 497 |
+
"percentiles": percentiles,
|
| 498 |
+
"histogram": histogram,
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
self._report.similarity_distribution = dist_info
|
| 502 |
+
return dist_info
|
| 503 |
+
|
| 504 |
+
# ------------------------------------------------------------------ #
|
| 505 |
+
# Full evaluation
|
| 506 |
+
# ------------------------------------------------------------------ #
|
| 507 |
+
|
| 508 |
+
def run_full_evaluation(
    self,
    ground_truth: Optional[list[GroundTruthEntry]] = None,
    candidate_meanings: Optional[dict[str, list[str]]] = None,
    retrieval_queries: Optional[list[dict]] = None,
    cluster_threshold: float = 0.35,
) -> EvaluationReport:
    """
    Execute every applicable evaluation stage and return the report.

    The similarity distribution is always computed; the retrieval,
    clustering, and disambiguation stages run only when their inputs are
    supplied (disambiguation additionally requires ground truth).

    Args:
        ground_truth: Labeled entries consumed by the clustering and
            disambiguation stages.
        candidate_meanings: Mapping of keyword -> candidate meanings for
            the disambiguation stage.
        retrieval_queries: Labeled queries for the retrieval stage.
        cluster_threshold: Distance threshold passed to clustering.

    Returns:
        The accumulated EvaluationReport, with total timing recorded.
    """
    logger.info("Running full evaluation pipeline...")
    started = time.time()

    # The distribution analysis needs no labels, so it always runs.
    self.analyze_similarity_distribution()

    if retrieval_queries:
        self.evaluate_retrieval(retrieval_queries)

    if ground_truth:
        self.evaluate_clustering(ground_truth, cluster_threshold)
        # Disambiguation needs both labeled data and candidate meanings.
        if candidate_meanings:
            self.evaluate_disambiguation(ground_truth, candidate_meanings)

    elapsed = time.time() - started
    self._report.timing["total_eval_seconds"] = round(elapsed, 3)
    logger.info("Evaluation complete.")
    return self._report
|
| 544 |
+
|
| 545 |
+
def get_report(self) -> EvaluationReport:
    """Expose the evaluation report accumulated so far."""
    return self._report
|
frontend/.gitignore
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logs
|
| 2 |
+
logs
|
| 3 |
+
*.log
|
| 4 |
+
npm-debug.log*
|
| 5 |
+
yarn-debug.log*
|
| 6 |
+
yarn-error.log*
|
| 7 |
+
pnpm-debug.log*
|
| 8 |
+
lerna-debug.log*
|
| 9 |
+
|
| 10 |
+
node_modules
|
| 11 |
+
dist
|
| 12 |
+
dist-ssr
|
| 13 |
+
*.local
|
| 14 |
+
|
| 15 |
+
# Editor directories and files
|
| 16 |
+
.vscode/*
|
| 17 |
+
!.vscode/extensions.json
|
| 18 |
+
.idea
|
| 19 |
+
.DS_Store
|
| 20 |
+
*.suo
|
| 21 |
+
*.ntvs*
|
| 22 |
+
*.njsproj
|
| 23 |
+
*.sln
|
| 24 |
+
*.sw?
|
frontend/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# React + Vite
|
| 2 |
+
|
| 3 |
+
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
| 4 |
+
|
| 5 |
+
Currently, two official plugins are available:
|
| 6 |
+
|
| 7 |
+
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh
|
| 8 |
+
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
|
| 9 |
+
|
| 10 |
+
## React Compiler
|
| 11 |
+
|
| 12 |
+
The React Compiler is not enabled on this template because of its impact on dev & build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
|
| 13 |
+
|
| 14 |
+
## Expanding the ESLint configuration
|
| 15 |
+
|
| 16 |
+
If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
|
frontend/eslint.config.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ESLint flat config for the Vite + React frontend.
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import { defineConfig, globalIgnores } from 'eslint/config'

export default defineConfig([
  // Never lint build output.
  globalIgnores(['dist']),
  {
    files: ['**/*.{js,jsx}'],
    extends: [
      js.configs.recommended,
      reactHooks.configs.flat.recommended,
      reactRefresh.configs.vite,
    ],
    languageOptions: {
      ecmaVersion: 2020,
      // Browser globals (window, document, ...) are predefined.
      globals: globals.browser,
      parserOptions: {
        ecmaVersion: 'latest',
        ecmaFeatures: { jsx: true },
        sourceType: 'module',
      },
    },
    rules: {
      // Permit intentionally-unused bindings that start with a capital
      // letter or underscore (e.g. ignored components/constants).
      'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
    },
  },
])
|
frontend/index.html
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Contextual Similarity Engine</title>
  </head>
  <body>
    <!-- React mount point; the app is bootstrapped by src/main.tsx. -->
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
|
frontend/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "contextual-similarity-ui",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "1.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "tsc -b && vite build",
|
| 9 |
+
"lint": "eslint .",
|
| 10 |
+
"preview": "vite preview"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"axios": "^1.13.6",
|
| 14 |
+
"react": "^19.2.4",
|
| 15 |
+
"react-dom": "^19.2.4",
|
| 16 |
+
"recharts": "^3.8.0"
|
| 17 |
+
},
|
| 18 |
+
"devDependencies": {
|
| 19 |
+
"@eslint/js": "^9.39.4",
|
| 20 |
+
"@types/react": "^19.2.14",
|
| 21 |
+
"@types/react-dom": "^19.2.3",
|
| 22 |
+
"@vitejs/plugin-react": "^5.1.4",
|
| 23 |
+
"eslint": "^9.39.4",
|
| 24 |
+
"eslint-plugin-react-hooks": "^7.0.1",
|
| 25 |
+
"eslint-plugin-react-refresh": "^0.5.2",
|
| 26 |
+
"globals": "^17.4.0",
|
| 27 |
+
"typescript": "~5.9.3",
|
| 28 |
+
"vite": "^7.3.1"
|
| 29 |
+
}
|
| 30 |
+
}
|
frontend/public/vite.svg
ADDED
|
|
frontend/src/App.tsx
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useEffect, Fragment } from "react";
|
| 2 |
+
import type { CorpusStats } from "./types";
|
| 3 |
+
import { api, checkConnection } from "./api";
|
| 4 |
+
import TrainingPanel from "./components/TrainingPanel";
|
| 5 |
+
import EngineSetup from "./components/EngineSetup";
|
| 6 |
+
import SemanticSearch from "./components/SemanticSearch";
|
| 7 |
+
import TextCompare from "./components/TextCompare";
|
| 8 |
+
import KeywordAnalysis from "./components/KeywordAnalysis";
|
| 9 |
+
import KeywordMatcher from "./components/KeywordMatcher";
|
| 10 |
+
import BatchAnalysis from "./components/BatchAnalysis";
|
| 11 |
+
import SimilarWords from "./components/SimilarWords";
|
| 12 |
+
import ContextAnalysis from "./components/ContextAnalysis";
|
| 13 |
+
import EvaluationDashboard from "./components/EvaluationDashboard";
|
| 14 |
+
import Word2VecPanel from "./components/Word2VecPanel";
|
| 15 |
+
import DatasetPanel from "./components/DatasetPanel";
|
| 16 |
+
import "./styles.css";
|
| 17 |
+
|
| 18 |
+
// Top-level navigation groups rendered in the stepper.
type NavGroup = "data" | "training" | "analysis" | "evaluation";
// Sub-tabs within the Training group.
type TrainingTab = "model" | "w2v";
// Sub-tabs within the Analysis group.
type AnalysisTab = "context" | "words" | "search" | "compare" | "keyword" | "match" | "batch";

// Stepper entries; `needsIndex` disables the step until an index is built.
const STEPS: { id: NavGroup; label: string; needsIndex?: boolean }[] = [
  { id: "data", label: "Data & Setup" },
  { id: "training", label: "Training" },
  { id: "analysis", label: "Analysis", needsIndex: true },
  { id: "evaluation", label: "Evaluation", needsIndex: true },
];

const TRAINING_TABS: { id: TrainingTab; label: string }[] = [
  { id: "model", label: "Fine-tune Model" },
  { id: "w2v", label: "Word2Vec Baseline" },
];

const ANALYSIS_TABS: { id: AnalysisTab; label: string }[] = [
  { id: "context", label: "Context" },
  { id: "words", label: "Similar Words" },
  { id: "search", label: "Search" },
  { id: "compare", label: "Compare" },
  { id: "keyword", label: "Keywords" },
  { id: "match", label: "Matcher" },
  { id: "batch", label: "Batch" },
];
|
| 43 |
+
|
| 44 |
+
/**
 * Root component of the Contextual Similarity Engine UI.
 *
 * Renders a header with live corpus stats, a stepper that doubles as the
 * main navigation, sub-tab rows for groups with multiple views, and the
 * panel selected by the current group/tab state. Steps flagged with
 * `needsIndex` stay disabled until the backend reports a built index.
 */
export default function App() {
  const [group, setGroup] = useState<NavGroup>("data");
  const [trainingTab, setTrainingTab] = useState<TrainingTab>("model");
  const [analysisTab, setAnalysisTab] = useState<AnalysisTab>("context");
  const [stats, setStats] = useState<CorpusStats | null>(null);
  const [showManualSetup, setShowManualSetup] = useState(false);
  const [serverError, setServerError] = useState<string | null>(null);
  // Gate for the analysis/evaluation steps: stats fetched and index built.
  const ready = stats !== null && stats.index_built;

  useEffect(() => {
    checkConnection().then((err) => {
      setServerError(err);
      // If server is up, try to fetch stats (engine may have been auto-restored)
      if (!err) {
        api.getStats().then(setStats).catch(() => {});
      }
    });
    // Re-check server availability every 15 seconds.
    const interval = setInterval(() => {
      checkConnection().then(setServerError);
    }, 15000);
    return () => clearInterval(interval);
  }, []);

  // Ignore clicks on index-gated steps while no index exists.
  function handleStepClick(id: NavGroup, needsIndex?: boolean) {
    if (needsIndex && !ready) return;
    setGroup(id);
  }

  return (
    <div className="app">
      <header className="app-header">
        <h1>Contextual Similarity Engine</h1>
        {stats && (
          <div className="header-stats">
            <span className="badge">{stats.model_name}</span>
            <span className="badge">{stats.total_documents} docs</span>
            <span className="badge">{stats.total_chunks} chunks</span>
            <span className={`badge ${stats.index_built ? "badge-ok" : "badge-warn"}`}>
              {stats.index_built ? "Index ready" : "Index not built"}
            </span>
          </div>
        )}
      </header>

      {serverError && (
        <div className="server-error-banner">
          <strong>Server unavailable:</strong> {serverError}
        </div>
      )}

      {/* Progress Stepper (serves as main navigation) */}
      <nav className="stepper">
        {STEPS.map((step, i) => {
          const disabled = step.needsIndex && !ready;
          const active = group === step.id;
          // Only the first step gets a checkmark, once the index is ready.
          const done = step.id === "data" && ready;
          return (
            <Fragment key={step.id}>
              {i > 0 && (
                <div className={`stepper-line ${!disabled ? "stepper-line-active" : ""}`} />
              )}
              <div className="stepper-item">
                <button
                  className={`stepper-circle ${active ? "stepper-active" : ""} ${done && !active ? "stepper-done" : ""}`}
                  onClick={() => handleStepClick(step.id, step.needsIndex)}
                  disabled={disabled}
                >
                  {done && !active ? "\u2713" : i + 1}
                </button>
                <span className={`stepper-label ${active ? "stepper-label-active" : ""}`}>
                  {step.label}
                </span>
              </div>
            </Fragment>
          );
        })}
      </nav>

      {/* Sub-tabs for groups with multiple views */}
      {group === "training" && (
        <nav className="subtabs">
          {TRAINING_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${trainingTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setTrainingTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {group === "analysis" && (
        <nav className="subtabs">
          {ANALYSIS_TABS.map((t) => (
            <button
              key={t.id}
              className={`subtab ${analysisTab === t.id ? "subtab-active" : ""}`}
              onClick={() => setAnalysisTab(t.id)}
            >
              {t.label}
            </button>
          ))}
        </nav>
      )}

      {/* Content */}
      <main className="content">
        {group === "data" && (
          <>
            <DatasetPanel onStatsUpdate={setStats} />
            <button
              className="collapsible-toggle"
              onClick={() => setShowManualSetup(!showManualSetup)}
            >
              <span className="collapsible-arrow">{showManualSetup ? "\u25be" : "\u25b8"}</span>
              Or add documents manually
            </button>
            {showManualSetup && <EngineSetup onStatsUpdate={setStats} />}
          </>
        )}

        {group === "training" && trainingTab === "model" && <TrainingPanel />}
        {group === "training" && trainingTab === "w2v" && <Word2VecPanel />}

        {group === "analysis" && analysisTab === "context" && <ContextAnalysis />}
        {group === "analysis" && analysisTab === "words" && <SimilarWords />}
        {group === "analysis" && analysisTab === "search" && <SemanticSearch />}
        {group === "analysis" && analysisTab === "compare" && <TextCompare />}
        {group === "analysis" && analysisTab === "keyword" && <KeywordAnalysis />}
        {group === "analysis" && analysisTab === "match" && <KeywordMatcher />}
        {group === "analysis" && analysisTab === "batch" && <BatchAnalysis />}

        {group === "evaluation" && <EvaluationDashboard />}
      </main>
    </div>
  );
}
|
frontend/src/api.ts
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axios from "axios";
|
| 2 |
+
import type {
|
| 3 |
+
InitRequest, InitResponse, DocumentRequest, AddDocResponse, BuildIndexResponse,
|
| 4 |
+
QueryRequest, QueryResponse, CompareRequest, CompareResponse,
|
| 5 |
+
KeywordAnalysisRequest, KeywordAnalysisResponse,
|
| 6 |
+
KeywordMatchRequest, MatchResponse, BatchAnalysisRequest,
|
| 7 |
+
CorpusStats, SimilarityDistribution, DisambiguationMetric, RetrievalMetric,
|
| 8 |
+
TrainResponse, TrainEvalResponse,
|
| 9 |
+
W2VInitResponse, W2VQueryResult, W2VSimilarWord,
|
| 10 |
+
DatasetInfo, DatasetLoadRequest, DatasetLoadResponse, DatasetPreviewResponse,
|
| 11 |
+
ContextAnalysisResponse,
|
| 12 |
+
} from "./types";
|
| 13 |
+
|
| 14 |
+
// Shared Axios instance; all endpoints are served under the /api prefix.
const client = axios.create({ baseURL: "/api" });
// Extended timeout (10 minutes) for long-running requests such as training.
const long = { timeout: 600000 };
|
| 16 |
+
|
| 17 |
+
/** Extract a human-readable error message from an Axios error. */
|
| 18 |
+
export function getErrorMessage(err: unknown): string {
|
| 19 |
+
if (axios.isAxiosError(err)) {
|
| 20 |
+
if (err.code === "ECONNABORTED") return "Request timed out. The server may be busy.";
|
| 21 |
+
if (!err.response) return "Cannot connect to server. Is it running? (uv run python server.py)";
|
| 22 |
+
const detail = err.response.data?.detail;
|
| 23 |
+
if (typeof detail === "string") return detail;
|
| 24 |
+
if (typeof err.response.data === "string") return err.response.data;
|
| 25 |
+
return `Server error (${err.response.status})`;
|
| 26 |
+
}
|
| 27 |
+
if (err instanceof Error) return err.message;
|
| 28 |
+
return "An unexpected error occurred.";
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
/** Check if the backend is reachable. Returns null on success or an error message. */
|
| 32 |
+
export async function checkConnection(): Promise<string | null> {
|
| 33 |
+
try {
|
| 34 |
+
await client.get("/stats", { timeout: 5000 });
|
| 35 |
+
return null;
|
| 36 |
+
} catch (err) {
|
| 37 |
+
if (axios.isAxiosError(err) && err.response?.status === 400) {
|
| 38 |
+
// 400 = "Engine not initialized" — server is up, just no engine yet
|
| 39 |
+
return null;
|
| 40 |
+
}
|
| 41 |
+
return getErrorMessage(err);
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
/** Shared shape for all training requests (matches server TrainRequest). */
interface TrainRequestData {
  corpus_texts: string[]; // texts used as the training corpus
  base_model: string; // identifier of the model to fine-tune
  output_path: string; // server-side path where the trained model is saved
  epochs: number;
  batch_size: number;
}
|
| 53 |
+
|
| 54 |
+
/**
 * Typed wrappers around every backend endpoint. Each method returns the
 * unwrapped response payload; failures surface as Axios errors (see
 * getErrorMessage). Slow operations pass `long` to extend the timeout.
 */
export const api = {
  // ---- Training ----
  trainUnsupervised: (data: TrainRequestData) =>
    client.post<TrainResponse>("/train/unsupervised", data, long).then(r => r.data),

  trainContrastive: (data: TrainRequestData) =>
    client.post<TrainResponse>("/train/contrastive", data, long).then(r => r.data),

  trainKeywords: (data: TrainRequestData & { keyword_meanings: Record<string, string> }) =>
    client.post<TrainResponse>("/train/keywords", data, long).then(r => r.data),

  trainEvaluate: (data: { test_pairs: { text_a: string; text_b: string; expected: number }[]; trained_model_path: string; base_model: string; corpus_texts: string[] }) =>
    client.post<TrainEvalResponse>("/train/evaluate", data).then(r => r.data),

  // ---- Engine ----
  init: (data: InitRequest) =>
    client.post<InitResponse>("/init", data).then(r => r.data),

  addDocument: (data: DocumentRequest) =>
    client.post<AddDocResponse>("/documents", data).then(r => r.data),

  buildIndex: () =>
    client.post<BuildIndexResponse>("/index/build").then(r => r.data),

  query: (data: QueryRequest) =>
    client.post<QueryResponse>("/query", data).then(r => r.data),

  compare: (data: CompareRequest) =>
    client.post<CompareResponse>("/compare", data).then(r => r.data),

  analyzeKeyword: (data: KeywordAnalysisRequest) =>
    client.post<KeywordAnalysisResponse>("/analyze/keyword", data).then(r => r.data),

  batchAnalyze: (data: BatchAnalysisRequest) =>
    client.post<Record<string, KeywordAnalysisResponse>>("/analyze/batch", data).then(r => r.data),

  matchKeyword: (data: KeywordMatchRequest) =>
    client.post<MatchResponse>("/match", data).then(r => r.data),

  analyzeContext: (data: { keyword: string; cluster_threshold?: number; top_words?: number }) =>
    client.post<ContextAnalysisResponse>("/analyze/context", data).then(r => r.data),

  similarWords: (data: { word: string; top_k: number }) =>
    client.post<{ word: string; similar: { word: string; score: number }[] }>("/analyze/similar-words", data).then(r => r.data),

  getStats: () =>
    client.get<CorpusStats>("/stats").then(r => r.data),

  getCorpusTexts: (maxDocs: number = 500) =>
    client.get<{ documents: { doc_id: string; text: string }[]; count: number }>(`/corpus/texts?max_docs=${maxDocs}`).then(r => r.data),

  // ---- Engine persistence ----
  saveEngine: () =>
    client.post<{ status: string; chunks: number; documents: number }>("/engine/save").then(r => r.data),

  hasSavedState: () =>
    client.get<{ exists: boolean }>("/engine/has-saved-state").then(r => r.data),

  // ---- Evaluation ----
  getSimilarityDistribution: () =>
    client.get<SimilarityDistribution>("/eval/similarity-distribution").then(r => r.data),

  evalDisambiguation: (data: { ground_truth: { keyword: string; text: string; true_meaning: string }[]; candidate_meanings: Record<string, string[]> }) =>
    client.post<{ metrics: DisambiguationMetric[] }>("/eval/disambiguation", data).then(r => r.data),

  evalRetrieval: (data: { queries: { query: string; relevant_doc_ids?: string[]; relevant_texts?: string[] }[]; k_values: number[] }) =>
    client.post<{ metrics: RetrievalMetric[] }>("/eval/retrieval", data).then(r => r.data),

  // ---- Word2Vec ----
  w2vInit: (data: { corpus_texts: string[]; vector_size: number; window: number; epochs: number }) =>
    client.post<W2VInitResponse>("/w2v/init", data, long).then(r => r.data),

  w2vCompare: (data: { text_a: string; text_b: string }) =>
    client.post<CompareResponse>("/w2v/compare", data).then(r => r.data),

  w2vQuery: (data: { text: string; top_k: number }) =>
    client.post<{ query: string; results: W2VQueryResult[] }>("/w2v/query", data).then(r => r.data),

  w2vSimilarWords: (data: { word: string; top_k: number }) =>
    client.post<{ word: string; similar: W2VSimilarWord[] }>("/w2v/similar-words", data).then(r => r.data),

  // ---- Dataset (HuggingFace) ----
  datasetInfo: () =>
    client.get<DatasetInfo>("/dataset/info").then(r => r.data),

  datasetLoad: (data: DatasetLoadRequest) =>
    client.post<DatasetLoadResponse>("/dataset/load", data, long).then(r => r.data),

  datasetPreview: (maxDocs: number = 10, sourceFilter?: string) =>
    client.post<DatasetPreviewResponse>(`/dataset/preview?max_docs=${maxDocs}${sourceFilter ? `&source_filter=${sourceFilter}` : ""}`).then(r => r.data),
};
|
frontend/src/assets/react.svg
ADDED
|
|
frontend/src/components/BatchAnalysis.tsx
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import type { KeywordAnalysisResponse } from "../types";
|
| 4 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
+
import ScoreBar from "./ScoreBar";
|
| 6 |
+
import StatusMessage from "./StatusMessage";
|
| 7 |
+
|
| 8 |
+
/**
 * Panel for analyzing several keywords in a single request.
 *
 * Keywords are entered one per line; each is clustered into meaning groups,
 * and when the backend returns cross-keyword similarities a comparison
 * matrix is rendered above the per-keyword cluster cards.
 */
export default function BatchAnalysis() {
  const [keywordsText, setKeywordsText] = useState("");
  const [topK, setTopK] = useState(5);
  const [threshold, setThreshold] = useState(0.4);
  const { data: results, loading, error, run } = useApiCall<Record<string, KeywordAnalysisResponse>>();

  async function handleAnalyze() {
    // Split on newlines; drop blank lines and surrounding whitespace.
    const keywords = keywordsText.split("\n").map((s) => s.trim()).filter(Boolean);
    if (keywords.length === 0) return;
    await run(() => api.batchAnalyze({ keywords, top_k: topK, cluster_threshold: threshold, compare_across: true }));
  }

  return (
    <div>
      <div className="panel">
        <h2>Batch Keyword Analysis</h2>
        <p className="panel-desc">
          Analyze multiple keywords at once and compare their semantic relationships.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Keywords (one per line)</label>
            <textarea
              value={keywordsText}
              onChange={(e) => setKeywordsText(e.target.value)}
              placeholder={`pizza\nschool\nhomework`}
              rows={4}
            />
          </div>
          <div className="flex-col gap-1">
            <div className="form-group form-group-sm">
              <label>Top K</label>
              <input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
            </div>
            <div className="form-group form-group-md">
              <label>Cluster Threshold</label>
              <input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
            </div>
          </div>
        </div>
        <button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keywordsText.trim()}>
          {loading ? "Analyzing..." : "Analyze All"}
        </button>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && (
        <>
          {/* Matrix renders only when at least one keyword has cross-similarities. */}
          {Object.values(results).some((a) => Object.keys(a.cross_keyword_similarities).length > 0) && (
            <div className="panel">
              <h3>Cross-Keyword Similarity</h3>
              <table className="data-table">
                <thead>
                  <tr>
                    <th>Keyword</th>
                    {Object.keys(results).map((kw) => (
                      <th key={kw}>{kw}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {Object.entries(results).map(([kw, analysis]) => (
                    <tr key={kw}>
                      <td style={{ fontWeight: 600 }}>{kw}</td>
                      {Object.keys(results).map((other) => (
                        <td key={other}>
                          {kw === other ? (
                            <span className="text-dim">-</span>
                          ) : (
                            <ScoreBar score={analysis.cross_keyword_similarities[other] ?? 0} />
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>
          )}

          {/* One card per keyword, listing its meaning clusters. */}
          {Object.entries(results).map(([kw, analysis]) => (
            <div key={kw} className="panel">
              <h3>
                "{kw}" — {analysis.total_occurrences} occurrence(s),{" "}
                {analysis.meaning_clusters.length} cluster(s)
              </h3>
              {analysis.meaning_clusters.map((cluster) => (
                <div key={cluster.cluster_id} className="result-card mt-1">
                  <div className="result-header">
                    <strong>Cluster {cluster.cluster_id}</strong>
                    <span className="tag">{cluster.size} occurrence(s)</span>
                  </div>
                  <div className="result-text">{cluster.representative_text.slice(0, 200)}...</div>
                </div>
              ))}
            </div>
          ))}
        </>
      )}
    </div>
  );
}
|
frontend/src/components/ContextAnalysis.tsx
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import type { ContextAnalysisResponse } from "../types";
|
| 4 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
+
import StatusMessage from "./StatusMessage";
|
| 6 |
+
|
| 7 |
+
export default function ContextAnalysis() {
|
| 8 |
+
const [keyword, setKeyword] = useState("");
|
| 9 |
+
const { data: result, loading, error, run } = useApiCall<ContextAnalysisResponse>();
|
| 10 |
+
|
| 11 |
+
async function handleAnalyze() {
|
| 12 |
+
if (!keyword.trim()) return;
|
| 13 |
+
await run(() => api.analyzeContext({ keyword: keyword.trim() }));
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
return (
|
| 17 |
+
<div>
|
| 18 |
+
<div className="panel">
|
| 19 |
+
<h2>Context Analysis</h2>
|
| 20 |
+
<p className="panel-desc">
|
| 21 |
+
Enter a keyword to discover what it likely means based on how it's used in the corpus.
|
| 22 |
+
The engine clusters all occurrences and extracts the most associated words for each meaning.
|
| 23 |
+
</p>
|
| 24 |
+
<div className="flex-row" style={{ alignItems: "flex-end" }}>
|
| 25 |
+
<div className="form-group form-group-lg">
|
| 26 |
+
<label>Keyword</label>
|
| 27 |
+
<input
|
| 28 |
+
value={keyword}
|
| 29 |
+
onChange={(e) => setKeyword(e.target.value)}
|
| 30 |
+
onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
|
| 31 |
+
placeholder="e.g. Epstein, flight, island"
|
| 32 |
+
/>
|
| 33 |
+
</div>
|
| 34 |
+
<button
|
| 35 |
+
className="btn btn-primary"
|
| 36 |
+
onClick={handleAnalyze}
|
| 37 |
+
disabled={loading || !keyword.trim()}
|
| 38 |
+
style={{ height: 38 }}
|
| 39 |
+
>
|
| 40 |
+
{loading ? "Analyzing..." : "Analyze"}
|
| 41 |
+
</button>
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
|
| 45 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 46 |
+
|
| 47 |
+
{result && result.total_occurrences === 0 && (
|
| 48 |
+
<StatusMessage type="err" message={`No occurrences of "${result.keyword}" found in the corpus.`} />
|
| 49 |
+
)}
|
| 50 |
+
|
| 51 |
+
{result && result.meanings.length > 0 && (
|
| 52 |
+
<div className="panel">
|
| 53 |
+
<h2>
|
| 54 |
+
"{result.keyword}" — {result.total_occurrences} occurrences, {result.meanings.length} meaning{result.meanings.length > 1 ? "s" : ""}
|
| 55 |
+
</h2>
|
| 56 |
+
|
| 57 |
+
<div className="flex-col gap-3">
|
| 58 |
+
{result.meanings.map((meaning, idx) => (
|
| 59 |
+
<div key={meaning.cluster_id} className="result-card">
|
| 60 |
+
<div className="result-header">
|
| 61 |
+
<span style={{ fontWeight: 600, fontSize: "0.9rem" }}>
|
| 62 |
+
Meaning {idx + 1}
|
| 63 |
+
</span>
|
| 64 |
+
<div className="flex-row">
|
| 65 |
+
<span className="badge">
|
| 66 |
+
{meaning.occurrences} occurrence{meaning.occurrences > 1 ? "s" : ""}
|
| 67 |
+
</span>
|
| 68 |
+
<span
|
| 69 |
+
className="badge"
|
| 70 |
+
style={{
|
| 71 |
+
background: `rgba(${meaning.confidence > 0.5 ? "74, 222, 128" : "108, 140, 255"}, 0.15)`,
|
| 72 |
+
color: meaning.confidence > 0.5 ? "var(--ok)" : "var(--accent)",
|
| 73 |
+
}}
|
| 74 |
+
>
|
| 75 |
+
{(meaning.confidence * 100).toFixed(1)}%
|
| 76 |
+
</span>
|
| 77 |
+
</div>
|
| 78 |
+
</div>
|
| 79 |
+
|
| 80 |
+
{/* Associated words bar chart */}
|
| 81 |
+
<div className="mt-2">
|
| 82 |
+
{meaning.associated_words.map((aw) => {
|
| 83 |
+
const maxScore = meaning.associated_words[0]?.score || 1;
|
| 84 |
+
const pct = Math.round((aw.score / maxScore) * 100);
|
| 85 |
+
return (
|
| 86 |
+
<div key={aw.word} className="context-bar-row">
|
| 87 |
+
<span className="context-bar-label">{aw.word}</span>
|
| 88 |
+
<div className="context-bar-track">
|
| 89 |
+
<div className="context-bar-fill" style={{ width: `${pct}%` }} />
|
| 90 |
+
</div>
|
| 91 |
+
<span className="context-bar-value">{(aw.score * 100).toFixed(0)}</span>
|
| 92 |
+
</div>
|
| 93 |
+
);
|
| 94 |
+
})}
|
| 95 |
+
</div>
|
| 96 |
+
|
| 97 |
+
{/* Example snippets */}
|
| 98 |
+
{meaning.example_contexts.length > 0 && (
|
| 99 |
+
<div className="mt-2">
|
| 100 |
+
<div className="section-label">Example contexts</div>
|
| 101 |
+
{meaning.example_contexts.map((ex, i) => (
|
| 102 |
+
<div key={i} className="context-snippet">
|
| 103 |
+
<span className="context-snippet-source">{ex.doc_id}</span>
|
| 104 |
+
{ex.snippet}
|
| 105 |
+
</div>
|
| 106 |
+
))}
|
| 107 |
+
</div>
|
| 108 |
+
)}
|
| 109 |
+
</div>
|
| 110 |
+
))}
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
)}
|
| 114 |
+
</div>
|
| 115 |
+
);
|
| 116 |
+
}
|
frontend/src/components/DatasetPanel.tsx
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useEffect } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { DatasetInfo, DatasetLoadResponse, DatasetPreviewDoc } from "../types";
|
| 4 |
+
import StatusMessage from "./StatusMessage";
|
| 5 |
+
import MetricCard from "./MetricCard";
|
| 6 |
+
import Toggle from "./Toggle";
|
| 7 |
+
import Select from "./Select";
|
| 8 |
+
import Switch from "./Switch";
|
| 9 |
+
import LogViewer from "./LogViewer";
|
| 10 |
+
|
| 11 |
+
interface Props {
|
| 12 |
+
onStatsUpdate?: (stats: any) => void;
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
export default function DatasetPanel({ onStatsUpdate }: Props) {
|
| 16 |
+
const [info, setInfo] = useState<DatasetInfo | null>(null);
|
| 17 |
+
const [error, setError] = useState("");
|
| 18 |
+
|
| 19 |
+
// Load config
|
| 20 |
+
const [source, setSource] = useState<"raw" | "embeddings">("raw");
|
| 21 |
+
const [maxDocs, setMaxDocs] = useState(500);
|
| 22 |
+
const [minTextLen, setMinTextLen] = useState(100);
|
| 23 |
+
const [sourceFilter, setSourceFilter] = useState("");
|
| 24 |
+
const [loadAll, setLoadAll] = useState(true);
|
| 25 |
+
const [buildIndex, setBuildIndex] = useState(true);
|
| 26 |
+
const [loading, setLoading] = useState(false);
|
| 27 |
+
const [loadResult, setLoadResult] = useState<DatasetLoadResponse | null>(null);
|
| 28 |
+
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 29 |
+
|
| 30 |
+
// Preview
|
| 31 |
+
const [previewDocs, setPreviewDocs] = useState<DatasetPreviewDoc[]>([]);
|
| 32 |
+
const [previewLoading, setPreviewLoading] = useState(false);
|
| 33 |
+
|
| 34 |
+
useEffect(() => {
|
| 35 |
+
api.datasetInfo().then(setInfo).catch((err) => {
|
| 36 |
+
setError(getErrorMessage(err));
|
| 37 |
+
});
|
| 38 |
+
}, []);
|
| 39 |
+
|
| 40 |
+
async function handlePreview() {
|
| 41 |
+
setPreviewLoading(true); setError("");
|
| 42 |
+
try {
|
| 43 |
+
const res = await api.datasetPreview(10, sourceFilter || undefined);
|
| 44 |
+
setPreviewDocs(res.documents);
|
| 45 |
+
} catch (err) {
|
| 46 |
+
setError(getErrorMessage(err));
|
| 47 |
+
} finally {
|
| 48 |
+
setPreviewLoading(false);
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
async function handleLoad() {
|
| 53 |
+
setLoading(true); setError(""); setLoadResult(null);
|
| 54 |
+
try {
|
| 55 |
+
const res = await api.datasetLoad({
|
| 56 |
+
source,
|
| 57 |
+
max_docs: loadAll ? 100000 : maxDocs,
|
| 58 |
+
min_text_length: loadAll ? 0 : minTextLen,
|
| 59 |
+
source_filter: sourceFilter || undefined,
|
| 60 |
+
build_index: buildIndex,
|
| 61 |
+
});
|
| 62 |
+
setLoadResult(res);
|
| 63 |
+
if (onStatsUpdate) {
|
| 64 |
+
try { const s = await api.getStats(); onStatsUpdate(s); } catch (e) {
|
| 65 |
+
console.warn("Failed to refresh stats after load:", e);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
} catch (err) {
|
| 69 |
+
setError(getErrorMessage(err));
|
| 70 |
+
} finally {
|
| 71 |
+
setLoading(false);
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
return (
|
| 76 |
+
<div>
|
| 77 |
+
{/* Info */}
|
| 78 |
+
<div className="panel">
|
| 79 |
+
<h2>Epstein Files Dataset</h2>
|
| 80 |
+
<p className="panel-desc">
|
| 81 |
+
Load documents from the publicly released U.S. House Oversight Committee Epstein Files
|
| 82 |
+
via HuggingFace. Two sources available:
|
| 83 |
+
</p>
|
| 84 |
+
|
| 85 |
+
{info && (
|
| 86 |
+
<div style={{ display: "flex", gap: 12, flexWrap: "wrap", marginBottom: 16 }}>
|
| 87 |
+
<div className={`result-card ${source === "raw" ? "result-card-selected" : ""}`}
|
| 88 |
+
style={{ flex: "1 1 280px", cursor: "pointer" }}
|
| 89 |
+
onClick={() => setSource("raw")}>
|
| 90 |
+
<div className="result-header">
|
| 91 |
+
<strong>Raw Text Documents</strong>
|
| 92 |
+
<span className="badge">{info.raw_texts.size_mb} MB</span>
|
| 93 |
+
</div>
|
| 94 |
+
<div className="result-text">{info.raw_texts.description}</div>
|
| 95 |
+
<div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
|
| 96 |
+
Columns: {info.raw_texts.columns?.join(", ")}
|
| 97 |
+
</div>
|
| 98 |
+
</div>
|
| 99 |
+
<div className={`result-card ${source === "embeddings" ? "result-card-selected" : ""}`}
|
| 100 |
+
style={{ flex: "1 1 280px", cursor: "pointer" }}
|
| 101 |
+
onClick={() => setSource("embeddings")}>
|
| 102 |
+
<div className="result-header">
|
| 103 |
+
<strong>Pre-computed Embeddings</strong>
|
| 104 |
+
<span className="badge">{info.embeddings.vector_dim}d</span>
|
| 105 |
+
</div>
|
| 106 |
+
<div className="result-text">{info.embeddings.description}</div>
|
| 107 |
+
<div style={{ marginTop: 8, fontSize: "0.75rem", color: "var(--text-dim)" }}>
|
| 108 |
+
Model: {info.embeddings.model}
|
| 109 |
+
</div>
|
| 110 |
+
</div>
|
| 111 |
+
</div>
|
| 112 |
+
)}
|
| 113 |
+
|
| 114 |
+
<Toggle
|
| 115 |
+
options={[
|
| 116 |
+
{ value: "raw", label: "Raw Texts" },
|
| 117 |
+
{ value: "embeddings", label: "ChromaDB Embeddings" },
|
| 118 |
+
]}
|
| 119 |
+
value={source}
|
| 120 |
+
onChange={(v) => setSource(v as "raw" | "embeddings")}
|
| 121 |
+
/>
|
| 122 |
+
</div>
|
| 123 |
+
|
| 124 |
+
{/* Load actions + advanced config */}
|
| 125 |
+
<div className="panel">
|
| 126 |
+
<h2>Load Dataset</h2>
|
| 127 |
+
<div style={{ display: "flex", gap: 8, marginBottom: 12 }}>
|
| 128 |
+
<button className="btn btn-primary" onClick={handleLoad}
|
| 129 |
+
disabled={loading}>
|
| 130 |
+
{loading ? <><span className="spinner" /> Loading Dataset...</> : "Load into Engine"}
|
| 131 |
+
</button>
|
| 132 |
+
{source === "raw" && (
|
| 133 |
+
<button className="btn btn-secondary" onClick={handlePreview}
|
| 134 |
+
disabled={previewLoading}>
|
| 135 |
+
{previewLoading ? "Loading..." : "Preview Documents"}
|
| 136 |
+
</button>
|
| 137 |
+
)}
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 141 |
+
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 142 |
+
</button>
|
| 143 |
+
|
| 144 |
+
{showAdvanced && (
|
| 145 |
+
<div className="advanced-section">
|
| 146 |
+
<div className="form-row">
|
| 147 |
+
<div className="form-group" style={{ maxWidth: 200 }}>
|
| 148 |
+
<label>Load All Documents</label>
|
| 149 |
+
<Switch checked={loadAll} onChange={setLoadAll}
|
| 150 |
+
label={loadAll ? "Yes (no limits)" : "No (use filters below)"} />
|
| 151 |
+
</div>
|
| 152 |
+
{!loadAll && (
|
| 153 |
+
<>
|
| 154 |
+
<div className="form-group" style={{ maxWidth: 140 }}>
|
| 155 |
+
<label>Max Documents</label>
|
| 156 |
+
<input type="number" value={maxDocs} onChange={e => setMaxDocs(+e.target.value)}
|
| 157 |
+
min={10} max={100000} />
|
| 158 |
+
</div>
|
| 159 |
+
{source === "raw" && (
|
| 160 |
+
<div className="form-group" style={{ maxWidth: 140 }}>
|
| 161 |
+
<label>Min Text Length</label>
|
| 162 |
+
<input type="number" value={minTextLen} onChange={e => setMinTextLen(+e.target.value)}
|
| 163 |
+
min={0} max={10000} />
|
| 164 |
+
</div>
|
| 165 |
+
)}
|
| 166 |
+
</>
|
| 167 |
+
)}
|
| 168 |
+
{source === "raw" && (
|
| 169 |
+
<div className="form-group" style={{ maxWidth: 220 }}>
|
| 170 |
+
<label>Source Filter</label>
|
| 171 |
+
<Select
|
| 172 |
+
options={[
|
| 173 |
+
{ value: "", label: "All sources" },
|
| 174 |
+
{ value: "TEXT-", label: "TEXT- (native text files)" },
|
| 175 |
+
{ value: "IMAGES-", label: "IMAGES- (OCR from images)" },
|
| 176 |
+
]}
|
| 177 |
+
value={sourceFilter}
|
| 178 |
+
onChange={setSourceFilter}
|
| 179 |
+
/>
|
| 180 |
+
</div>
|
| 181 |
+
)}
|
| 182 |
+
<div className="form-group" style={{ maxWidth: 200 }}>
|
| 183 |
+
<label>Build Index</label>
|
| 184 |
+
<Switch checked={buildIndex} onChange={setBuildIndex}
|
| 185 |
+
label={buildIndex ? "Yes (ready to search)" : "No (load only)"} />
|
| 186 |
+
</div>
|
| 187 |
+
</div>
|
| 188 |
+
</div>
|
| 189 |
+
)}
|
| 190 |
+
|
| 191 |
+
{loading && (
|
| 192 |
+
<StatusMessage type="loading"
|
| 193 |
+
message="Downloading from HuggingFace and indexing. This may take several minutes for large datasets..." />
|
| 194 |
+
)}
|
| 195 |
+
|
| 196 |
+
<LogViewer active={loading} />
|
| 197 |
+
</div>
|
| 198 |
+
|
| 199 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 200 |
+
|
| 201 |
+
{/* Load result */}
|
| 202 |
+
{loadResult && (
|
| 203 |
+
<div className="panel">
|
| 204 |
+
<h2>Dataset Loaded</h2>
|
| 205 |
+
<div className="metric-grid mb-2">
|
| 206 |
+
{loadResult.documents_loaded !== undefined && (
|
| 207 |
+
<MetricCard value={loadResult.documents_loaded} label="Documents" />
|
| 208 |
+
)}
|
| 209 |
+
{loadResult.documents_created !== undefined && (
|
| 210 |
+
<MetricCard value={loadResult.documents_created} label="Documents" />
|
| 211 |
+
)}
|
| 212 |
+
{(loadResult.total_chunks || loadResult.chunks_indexed) && (
|
| 213 |
+
<MetricCard value={loadResult.total_chunks || loadResult.chunks_indexed || 0} label="Chunks" />
|
| 214 |
+
)}
|
| 215 |
+
{loadResult.chromadb_vectors !== undefined && (
|
| 216 |
+
<MetricCard value={loadResult.chromadb_vectors} label="Vectors Imported" />
|
| 217 |
+
)}
|
| 218 |
+
<MetricCard value={`${loadResult.seconds}s`} label="Time" />
|
| 219 |
+
</div>
|
| 220 |
+
<StatusMessage type="ok"
|
| 221 |
+
message={loadResult.index_built
|
| 222 |
+
? "Dataset loaded and FAISS index built. You can now search, analyze keywords, and run evaluations."
|
| 223 |
+
: "Dataset loaded. Build the index from the Setup tab to enable search."} />
|
| 224 |
+
</div>
|
| 225 |
+
)}
|
| 226 |
+
|
| 227 |
+
{/* Preview */}
|
| 228 |
+
{previewDocs.length > 0 && (
|
| 229 |
+
<div className="panel">
|
| 230 |
+
<h2>Document Preview ({previewDocs.length} docs)</h2>
|
| 231 |
+
{previewDocs.map((doc, i) => (
|
| 232 |
+
<div key={i} className="result-card" style={{ marginBottom: 8 }}>
|
| 233 |
+
<div className="result-header">
|
| 234 |
+
<span style={{ fontWeight: 600, fontSize: "0.85rem" }}>{doc.filename}</span>
|
| 235 |
+
<span className="badge">{(doc.text_length / 1000).toFixed(1)}K chars</span>
|
| 236 |
+
</div>
|
| 237 |
+
<div className="result-text" style={{ whiteSpace: "pre-wrap" }}>
|
| 238 |
+
{doc.text_preview}
|
| 239 |
+
</div>
|
| 240 |
+
</div>
|
| 241 |
+
))}
|
| 242 |
+
</div>
|
| 243 |
+
)}
|
| 244 |
+
</div>
|
| 245 |
+
);
|
| 246 |
+
}
|
frontend/src/components/EngineSetup.tsx
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { CorpusStats } from "../types";
|
| 4 |
+
import StatusMessage from "./StatusMessage";
|
| 5 |
+
import Select from "./Select";
|
| 6 |
+
|
| 7 |
+
interface Props {
|
| 8 |
+
onStatsUpdate: (stats: CorpusStats) => void;
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
const MODELS = [
|
| 12 |
+
{ value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast, 384-dim)" },
|
| 13 |
+
{ value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality, 768-dim)" },
|
| 14 |
+
{ value: "BAAI/bge-large-en-v1.5", label: "BAAI/bge-large-en-v1.5 (high accuracy, 1024-dim)" },
|
| 15 |
+
];
|
| 16 |
+
|
| 17 |
+
export default function EngineSetup({ onStatsUpdate }: Props) {
|
| 18 |
+
const [model, setModel] = useState("all-MiniLM-L6-v2");
|
| 19 |
+
const [chunkSize, setChunkSize] = useState(512);
|
| 20 |
+
const [chunkOverlap, setChunkOverlap] = useState(128);
|
| 21 |
+
const [batchSize, setBatchSize] = useState(64);
|
| 22 |
+
|
| 23 |
+
const [docId, setDocId] = useState("");
|
| 24 |
+
const [docText, setDocText] = useState("");
|
| 25 |
+
|
| 26 |
+
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 27 |
+
const [status, setStatus] = useState<{ type: "ok" | "err" | "loading"; msg: string } | null>(null);
|
| 28 |
+
const [initialized, setInitialized] = useState(false);
|
| 29 |
+
const [docsAdded, setDocsAdded] = useState<string[]>([]);
|
| 30 |
+
|
| 31 |
+
async function handleInit() {
|
| 32 |
+
setStatus({ type: "loading", msg: "Loading model..." });
|
| 33 |
+
try {
|
| 34 |
+
const res = await api.init({
|
| 35 |
+
model_name: model,
|
| 36 |
+
chunk_size: chunkSize,
|
| 37 |
+
chunk_overlap: chunkOverlap,
|
| 38 |
+
batch_size: batchSize,
|
| 39 |
+
});
|
| 40 |
+
setInitialized(true);
|
| 41 |
+
setDocsAdded([]);
|
| 42 |
+
setStatus({ type: "ok", msg: `Model "${res.model}" loaded in ${res.load_time_seconds}s` });
|
| 43 |
+
} catch (e: unknown) {
|
| 44 |
+
setStatus({ type: "err", msg: getErrorMessage(e) });
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
async function handleAddDoc() {
|
| 49 |
+
if (!docId.trim() || !docText.trim()) return;
|
| 50 |
+
setStatus({ type: "loading", msg: `Adding document "${docId}"...` });
|
| 51 |
+
try {
|
| 52 |
+
const res = await api.addDocument({ doc_id: docId, text: docText });
|
| 53 |
+
setDocsAdded((prev) => [...prev, res.doc_id]);
|
| 54 |
+
setStatus({ type: "ok", msg: `Added "${res.doc_id}": ${res.num_chunks} chunks` });
|
| 55 |
+
setDocId("");
|
| 56 |
+
setDocText("");
|
| 57 |
+
} catch (e: unknown) {
|
| 58 |
+
setStatus({ type: "err", msg: getErrorMessage(e) });
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
async function handleBuildIndex() {
|
| 63 |
+
setStatus({ type: "loading", msg: "Building FAISS index..." });
|
| 64 |
+
try {
|
| 65 |
+
const res = await api.buildIndex();
|
| 66 |
+
setStatus({
|
| 67 |
+
type: "ok",
|
| 68 |
+
msg: `Index built: ${res.total_chunks} vectors (dim=${res.embedding_dim}) in ${res.build_time_seconds}s`,
|
| 69 |
+
});
|
| 70 |
+
const stats = await api.getStats();
|
| 71 |
+
onStatsUpdate(stats);
|
| 72 |
+
} catch (e: unknown) {
|
| 73 |
+
setStatus({ type: "err", msg: getErrorMessage(e) });
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
return (
|
| 78 |
+
<div>
|
| 79 |
+
{/* Step 1: Initialize engine */}
|
| 80 |
+
<div className="panel">
|
| 81 |
+
<h2>1. Initialize Engine</h2>
|
| 82 |
+
<div className="form-row">
|
| 83 |
+
<div className="form-group">
|
| 84 |
+
<label>Model</label>
|
| 85 |
+
<Select options={MODELS} value={model} onChange={setModel} />
|
| 86 |
+
</div>
|
| 87 |
+
</div>
|
| 88 |
+
|
| 89 |
+
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 90 |
+
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 91 |
+
</button>
|
| 92 |
+
|
| 93 |
+
{showAdvanced && (
|
| 94 |
+
<div className="advanced-section">
|
| 95 |
+
<div className="form-row">
|
| 96 |
+
<div className="form-group form-group-md">
|
| 97 |
+
<label>Chunk Size</label>
|
| 98 |
+
<input type="number" value={chunkSize} onChange={(e) => setChunkSize(+e.target.value)} />
|
| 99 |
+
</div>
|
| 100 |
+
<div className="form-group form-group-md">
|
| 101 |
+
<label>Overlap</label>
|
| 102 |
+
<input type="number" value={chunkOverlap} onChange={(e) => setChunkOverlap(+e.target.value)} />
|
| 103 |
+
</div>
|
| 104 |
+
<div className="form-group form-group-md">
|
| 105 |
+
<label>Batch Size</label>
|
| 106 |
+
<input type="number" value={batchSize} onChange={(e) => setBatchSize(+e.target.value)} />
|
| 107 |
+
</div>
|
| 108 |
+
</div>
|
| 109 |
+
</div>
|
| 110 |
+
)}
|
| 111 |
+
|
| 112 |
+
<button className="btn btn-primary" onClick={handleInit} style={{ marginTop: 8 }}>
|
| 113 |
+
Initialize
|
| 114 |
+
</button>
|
| 115 |
+
</div>
|
| 116 |
+
|
| 117 |
+
{/* Step 2: Add documents */}
|
| 118 |
+
<div className="panel">
|
| 119 |
+
<h2>2. Add Documents</h2>
|
| 120 |
+
{docsAdded.length > 0 && (
|
| 121 |
+
<div style={{ marginBottom: 12 }}>
|
| 122 |
+
{docsAdded.map((id) => (
|
| 123 |
+
<span key={id} className="tag">{id}</span>
|
| 124 |
+
))}
|
| 125 |
+
</div>
|
| 126 |
+
)}
|
| 127 |
+
<div className="form-row">
|
| 128 |
+
<div className="form-group form-group-lg">
|
| 129 |
+
<label>Document ID</label>
|
| 130 |
+
<input
|
| 131 |
+
value={docId}
|
| 132 |
+
onChange={(e) => setDocId(e.target.value)}
|
| 133 |
+
placeholder="e.g. chapter_1"
|
| 134 |
+
disabled={!initialized}
|
| 135 |
+
/>
|
| 136 |
+
</div>
|
| 137 |
+
</div>
|
| 138 |
+
<div className="form-group mb-2">
|
| 139 |
+
<label>Document Text</label>
|
| 140 |
+
<textarea
|
| 141 |
+
value={docText}
|
| 142 |
+
onChange={(e) => setDocText(e.target.value)}
|
| 143 |
+
placeholder="Paste your document text here..."
|
| 144 |
+
rows={8}
|
| 145 |
+
disabled={!initialized}
|
| 146 |
+
/>
|
| 147 |
+
</div>
|
| 148 |
+
<button className="btn btn-primary" onClick={handleAddDoc} disabled={!initialized || !docId || !docText}>
|
| 149 |
+
Add Document
|
| 150 |
+
</button>
|
| 151 |
+
</div>
|
| 152 |
+
|
| 153 |
+
{/* Step 3: Build index */}
|
| 154 |
+
<div className="panel">
|
| 155 |
+
<h2>3. Build Index</h2>
|
| 156 |
+
<p className="panel-desc">
|
| 157 |
+
Embeds all chunks and builds a FAISS index for fast similarity search.
|
| 158 |
+
This must be done after adding all documents.
|
| 159 |
+
</p>
|
| 160 |
+
<button
|
| 161 |
+
className="btn btn-primary"
|
| 162 |
+
onClick={handleBuildIndex}
|
| 163 |
+
disabled={!initialized || docsAdded.length === 0}
|
| 164 |
+
>
|
| 165 |
+
Build Index
|
| 166 |
+
</button>
|
| 167 |
+
</div>
|
| 168 |
+
|
| 169 |
+
{status && <StatusMessage type={status.type} message={status.msg} />}
|
| 170 |
+
</div>
|
| 171 |
+
);
|
| 172 |
+
}
|
frontend/src/components/EvaluationDashboard.tsx
ADDED
|
@@ -0,0 +1,603 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import {
|
| 3 |
+
BarChart,
|
| 4 |
+
Bar,
|
| 5 |
+
XAxis,
|
| 6 |
+
YAxis,
|
| 7 |
+
CartesianGrid,
|
| 8 |
+
Tooltip,
|
| 9 |
+
ResponsiveContainer,
|
| 10 |
+
Cell,
|
| 11 |
+
} from "recharts";
|
| 12 |
+
import { api, getErrorMessage } from "../api";
|
| 13 |
+
import type { EvalSection, SimilarityDistribution, DisambiguationMetric, RetrievalMetric } from "../types";
|
| 14 |
+
import StatusMessage from "./StatusMessage";
|
| 15 |
+
import MetricCard from "./MetricCard";
|
| 16 |
+
|
| 17 |
+
// ---- Structured form types ----
|
| 18 |
+
|
| 19 |
+
interface GtRow {
|
| 20 |
+
text: string;
|
| 21 |
+
meaning: string;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
interface RetrievalRow {
|
| 25 |
+
query: string;
|
| 26 |
+
relevantText: string;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
// ---- Example data ----
|
| 30 |
+
|
| 31 |
+
const EXAMPLE_KEYWORD = "pizza";
|
| 32 |
+
const EXAMPLE_MEANINGS = [
|
| 33 |
+
"school, education, and academic activities like homework and tests",
|
| 34 |
+
"food, Italian cuisine, restaurant, cooking, and eating",
|
| 35 |
+
];
|
| 36 |
+
const EXAMPLE_GT: GtRow[] = [
|
| 37 |
+
{ text: "I love pizza so much, I go there every day", meaning: "school" },
|
| 38 |
+
{ text: "pizza gives me homework", meaning: "school" },
|
| 39 |
+
{ text: "she made the best margherita pizza in the city", meaning: "food" },
|
| 40 |
+
{ text: "pizza dough recipe used tipo 00 flour", meaning: "food" },
|
| 41 |
+
{ text: "The pizza test is going to be so hard", meaning: "school" },
|
| 42 |
+
{ text: "This pizza is amazing, the crust is perfectly crispy", meaning: "food" },
|
| 43 |
+
];
|
| 44 |
+
|
| 45 |
+
const EXAMPLE_RETRIEVAL: RetrievalRow[] = [
|
| 46 |
+
{ query: "kids using secret code words for school", relevantText: "secret language" },
|
| 47 |
+
{ query: "Italian restaurant with wood-fired oven", relevantText: "pizza" },
|
| 48 |
+
];
|
| 49 |
+
|
| 50 |
+
// ---- Meaning label helpers ----
|
| 51 |
+
|
| 52 |
+
function getMeaningLabels(meanings: string[]): string[] {
|
| 53 |
+
return meanings.map((m) => {
|
| 54 |
+
const first = m.split(",")[0].trim();
|
| 55 |
+
return first.length > 20 ? first.slice(0, 20) : first;
|
| 56 |
+
});
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
// ---- Tab config ----
|
| 60 |
+
|
| 61 |
+
const EVAL_TABS: { id: EvalSection; label: string; desc: string }[] = [
|
| 62 |
+
{
|
| 63 |
+
id: "distribution",
|
| 64 |
+
label: "Distribution",
|
| 65 |
+
desc: "Analyze pairwise similarity distribution across your corpus. One-click — no setup needed.",
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
id: "disambiguation",
|
| 69 |
+
label: "Disambiguation",
|
| 70 |
+
desc: "Test whether the engine can tell apart different meanings of the same word. Provide example sentences and label each with the intended meaning.",
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
id: "retrieval",
|
| 74 |
+
label: "Retrieval",
|
| 75 |
+
desc: "Measure how well the engine finds relevant documents for a given query. Provide search queries and what text they should match.",
|
| 76 |
+
},
|
| 77 |
+
];
|
| 78 |
+
|
| 79 |
+
/**
 * Evaluation dashboard with three sub-views selected via `section`:
 *  - "distribution": corpus-wide pairwise similarity stats + histogram (one API call)
 *  - "disambiguation": accuracy/F1 of keyword-sense matching against user-labeled examples
 *  - "retrieval": MRR / precision@k for user-supplied queries
 * Switching tabs clears the error banner but keeps previously computed results in state.
 */
export default function EvaluationDashboard() {
  const [section, setSection] = useState<EvalSection>("distribution");
  // Results per sub-view; null until the corresponding evaluation has been run.
  const [distrib, setDistrib] = useState<SimilarityDistribution | null>(null);
  const [disambig, setDisambig] = useState<DisambiguationMetric[] | null>(null);
  const [retrieval, setRetrieval] = useState<RetrievalMetric[] | null>(null);
  // Which evaluation is in flight: "" (idle), "distrib", "disambig", or "retrieval".
  const [loading, setLoading] = useState("");
  const [error, setError] = useState("");

  // Disambiguation structured form
  const [keyword, setKeyword] = useState("");
  // At least two meanings are always kept in the form (see removeMeaning guard).
  const [meanings, setMeanings] = useState<string[]>(["", ""]);
  const [gtRows, setGtRows] = useState<GtRow[]>([{ text: "", meaning: "" }]);

  // Retrieval structured form
  const [retRows, setRetRows] = useState<RetrievalRow[]>([{ query: "", relevantText: "" }]);

  // ---- Distribution ----

  // Fetch the pairwise similarity distribution; no inputs needed.
  async function fetchDistribution() {
    setLoading("distrib");
    setError("");
    try {
      setDistrib(await api.getSimilarityDistribution());
    } catch (err) {
      setError(getErrorMessage(err));
    } finally {
      setLoading("");
    }
  }

  // ---- Disambiguation ----

  // Replace the whole disambiguation form with the bundled example data
  // (copies, so later edits don't mutate the EXAMPLE_* constants).
  function loadDisambiguationExample() {
    setKeyword(EXAMPLE_KEYWORD);
    setMeanings([...EXAMPLE_MEANINGS]);
    setGtRows(EXAMPLE_GT.map((r) => ({ ...r })));
  }

  function updateMeaning(i: number, val: string) {
    const next = [...meanings];
    next[i] = val;
    setMeanings(next);
  }

  function addMeaning() {
    setMeanings([...meanings, ""]);
  }

  // Remove meaning i (never below the 2-meaning minimum) and blank out any
  // ground-truth rows that referenced the removed meaning's label.
  function removeMeaning(i: number) {
    if (meanings.length <= 2) return;
    setMeanings(meanings.filter((_, idx) => idx !== i));
    // Update GT rows that referenced removed meaning
    // (labels are derived from the pre-removal array, so labels[i] is the removed one)
    const labels = getMeaningLabels(meanings);
    const removed = labels[i];
    setGtRows(gtRows.map((r) => (r.meaning === removed ? { ...r, meaning: "" } : r)));
  }

  function updateGtRow(i: number, field: keyof GtRow, val: string) {
    const next = [...gtRows];
    next[i] = { ...next[i], [field]: val };
    setGtRows(next);
  }

  function addGtRow() {
    setGtRows([...gtRows, { text: "", meaning: "" }]);
  }

  function removeGtRow(i: number) {
    if (gtRows.length <= 1) return;
    setGtRows(gtRows.filter((_, idx) => idx !== i));
  }

  // Validate the form, then submit a disambiguation evaluation request.
  // Requires a keyword, >= 2 non-blank meanings, and >= 2 labeled examples.
  async function runDisambiguation() {
    if (!keyword.trim()) { setError("Enter a keyword."); return; }
    const validMeanings = meanings.filter((m) => m.trim());
    if (validMeanings.length < 2) { setError("Add at least 2 meanings."); return; }
    const validGt = gtRows.filter((r) => r.text.trim() && r.meaning);
    if (validGt.length < 2) { setError("Add at least 2 labeled examples."); return; }

    setLoading("disambig");
    setError("");
    try {
      // NOTE(review): `labels` is computed but never used below — candidate
      // for removal; left as-is here.
      const labels = getMeaningLabels(meanings);
      const ground_truth = validGt.map((r) => ({
        keyword: keyword.trim(),
        text: r.text,
        true_meaning: r.meaning,
      }));
      const candidate_meanings: Record<string, string[]> = {
        [keyword.trim()]: validMeanings,
      };
      // Map GT meaning labels back to full candidate strings for the API
      // The API compares against candidates, so true_meaning should match a candidate label
      // We use short labels for the dropdown, but the API uses them as-is for matching
      const res = await api.evalDisambiguation({ ground_truth, candidate_meanings });
      setDisambig(res.metrics);
    } catch (e) {
      setError(getErrorMessage(e));
    } finally {
      setLoading("");
    }
  }

  // ---- Retrieval ----

  function loadRetrievalExample() {
    setRetRows(EXAMPLE_RETRIEVAL.map((r) => ({ ...r })));
  }

  function updateRetRow(i: number, field: keyof RetrievalRow, val: string) {
    const next = [...retRows];
    next[i] = { ...next[i], [field]: val };
    setRetRows(next);
  }

  function addRetRow() {
    setRetRows([...retRows, { query: "", relevantText: "" }]);
  }

  function removeRetRow(i: number) {
    if (retRows.length <= 1) return;
    setRetRows(retRows.filter((_, idx) => idx !== i));
  }

  // Submit a retrieval evaluation for every row with a non-blank query.
  // A row's expected-match text is optional (empty relevant_texts list).
  async function runRetrieval() {
    const valid = retRows.filter((r) => r.query.trim());
    if (valid.length === 0) { setError("Add at least one query."); return; }

    setLoading("retrieval");
    setError("");
    try {
      const queries = valid.map((r) => ({
        query: r.query,
        relevant_texts: r.relevantText.trim() ? [r.relevantText.trim()] : [],
      }));
      const res = await api.evalRetrieval({ queries, k_values: [1, 3, 5, 10] });
      setRetrieval(res.metrics);
    } catch (e) {
      setError(getErrorMessage(e));
    } finally {
      setLoading("");
    }
  }

  // ---- Meaning labels for dropdown ----
  const meaningLabels = getMeaningLabels(meanings);

  return (
    <div>
      <nav className="subtabs mb-2">
        {EVAL_TABS.map((t) => (
          <button
            key={t.id}
            className={`subtab ${section === t.id ? "subtab-active" : ""}`}
            onClick={() => { setSection(t.id); setError(""); }}
          >
            {t.label}
          </button>
        ))}
      </nav>

      <p className="panel-desc">{EVAL_TABS.find((t) => t.id === section)?.desc}</p>

      {error && <StatusMessage type="err" message={error} />}

      {/* ---- Similarity Distribution ---- */}
      {section === "distribution" && (
        <div className="panel">
          <button className="btn btn-primary" onClick={fetchDistribution} disabled={loading === "distrib"}>
            {loading === "distrib" ? "Computing..." : "Compute Distribution"}
          </button>

          {distrib && (
            <div className="mt-2">
              <div className="metric-grid mb-3">
                {[
                  { label: "Mean", value: distrib.mean },
                  { label: "Std Dev", value: distrib.std },
                  { label: "Min", value: distrib.min },
                  { label: "Max", value: distrib.max },
                ].map((m) => (
                  <MetricCard key={m.label} value={m.value.toFixed(3)} label={m.label} />
                ))}
              </div>

              <h3>Histogram</h3>
              <ResponsiveContainer width="100%" height={250}>
                <BarChart data={distrib.histogram}>
                  <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" />
                  <XAxis
                    dataKey="bin_start"
                    tick={{ fill: "var(--text-dim)", fontSize: 11 }}
                    tickFormatter={(v: number) => v.toFixed(1)}
                  />
                  <YAxis tick={{ fill: "var(--text-dim)", fontSize: 11 }} />
                  <Tooltip
                    contentStyle={{
                      background: "var(--surface)",
                      border: "1px solid var(--border)",
                      borderRadius: 6,
                      color: "var(--text)",
                    }}
                    formatter={(value: unknown) => [Number(value), "Count"]}
                    labelFormatter={(v: unknown) => `Similarity: ${Number(v).toFixed(2)}`}
                  />
                  {/* Color bars by similarity band: green >= 0.5, accent >= 0, red below 0 */}
                  <Bar dataKey="count" radius={[4, 4, 0, 0]}>
                    {distrib.histogram.map((entry, i) => (
                      <Cell
                        key={i}
                        fill={entry.bin_start >= 0.5 ? "var(--ok)" : entry.bin_start >= 0 ? "var(--accent)" : "var(--err)"}
                      />
                    ))}
                  </Bar>
                </BarChart>
              </ResponsiveContainer>

              <h3 className="mt-2">Percentiles</h3>
              <table className="data-table">
                <thead>
                  <tr>
                    {Object.keys(distrib.percentiles).map((p) => (
                      <th key={p}>P{p}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  <tr>
                    {Object.values(distrib.percentiles).map((v, i) => (
                      <td key={i}>{v.toFixed(4)}</td>
                    ))}
                  </tr>
                </tbody>
              </table>
            </div>
          )}
        </div>
      )}

      {/* ---- Disambiguation Evaluation ---- */}
      {section === "disambiguation" && (
        <div className="panel">
          <div className="flex-row gap-2 mb-2">
            <button className="btn btn-secondary" onClick={loadDisambiguationExample}>
              Load Example
            </button>
          </div>

          {/* Keyword */}
          <div className="form-group mb-2" style={{ maxWidth: 300 }}>
            <label>Keyword</label>
            <input
              value={keyword}
              onChange={(e) => setKeyword(e.target.value)}
              placeholder='e.g. "pizza"'
            />
          </div>

          {/* Candidate Meanings */}
          <div className="mb-2">
            <label className="section-label">
              Candidate Meanings
              <span className="text-dim"> — describe each possible meaning</span>
            </label>
            {meanings.map((m, i) => (
              <div key={i} className="flex-row gap-1 mb-1">
                <span className="text-dim" style={{ minWidth: 24 }}>{i + 1}.</span>
                <input
                  value={m}
                  onChange={(e) => updateMeaning(i, e.target.value)}
                  placeholder={`Meaning ${i + 1} description...`}
                  style={{ flex: 1 }}
                />
                {meanings.length > 2 && (
                  <button className="btn btn-secondary" onClick={() => removeMeaning(i)}>
                    ×
                  </button>
                )}
              </div>
            ))}
            <button className="btn btn-secondary mt-1" onClick={addMeaning}>
              + Add Meaning
            </button>
          </div>

          {/* Ground Truth Examples */}
          <div className="mb-2">
            <label className="section-label">
              Labeled Examples
              <span className="text-dim"> — sentences using the keyword, with the correct meaning</span>
            </label>
            <table className="data-table">
              <thead>
                <tr>
                  <th style={{ width: "60%" }}>Sentence</th>
                  <th>Correct Meaning</th>
                  <th style={{ width: 40 }} />
                </tr>
              </thead>
              <tbody>
                {gtRows.map((row, i) => (
                  <tr key={i}>
                    <td>
                      <input
                        value={row.text}
                        onChange={(e) => updateGtRow(i, "text", e.target.value)}
                        placeholder="A sentence containing the keyword..."
                        style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
                      />
                    </td>
                    <td>
                      <select
                        value={row.meaning}
                        onChange={(e) => updateGtRow(i, "meaning", e.target.value)}
                        style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
                      >
                        <option value="">Select...</option>
                        {meaningLabels.map((label, j) => (
                          <option key={j} value={label}>{label}</option>
                        ))}
                      </select>
                    </td>
                    <td>
                      {gtRows.length > 1 && (
                        <button className="btn btn-secondary" onClick={() => removeGtRow(i)}>
                          ×
                        </button>
                      )}
                    </td>
                  </tr>
                ))}
              </tbody>
            </table>
            <button className="btn btn-secondary mt-1" onClick={addGtRow}>
              + Add Example
            </button>
          </div>

          <button
            className="btn btn-primary"
            onClick={runDisambiguation}
            disabled={loading === "disambig"}
          >
            {loading === "disambig" ? "Evaluating..." : "Run Evaluation"}
          </button>

          {disambig && disambig.map((m) => (
            <div key={m.keyword} className="mt-3">
              <h3>Results: "{m.keyword}" ({m.total_samples} samples)</h3>
              <div className="metric-grid mb-2">
                <MetricCard value={`${(m.accuracy * 100).toFixed(1)}%`} label="Accuracy" />
                <MetricCard value={`${(m.weighted_f1 * 100).toFixed(1)}%`} label="Weighted F1" />
              </div>

              <h3>Per-Meaning Scores</h3>
              <table className="data-table">
                <thead>
                  <tr>
                    <th>Meaning</th>
                    <th>Precision</th>
                    <th>Recall</th>
                    <th>F1</th>
                  </tr>
                </thead>
                <tbody>
                  {Object.keys(m.per_meaning_f1).map((meaning) => (
                    <tr key={meaning}>
                      <td>{meaning}</td>
                      <td>{m.per_meaning_precision[meaning]?.toFixed(4) ?? "-"}</td>
                      <td>{m.per_meaning_recall[meaning]?.toFixed(4) ?? "-"}</td>
                      <td style={{ fontWeight: 700 }}>{m.per_meaning_f1[meaning]?.toFixed(4) ?? "-"}</td>
                    </tr>
                  ))}
                </tbody>
              </table>

              {/* NOTE(review): matrix row order is assumed to follow the key order of
                  per_meaning_f1 — confirm against the API response shape. */}
              {m.confusion_matrix && (
                <>
                  <h3 className="mt-2">Confusion Matrix</h3>
                  <table className="data-table">
                    <thead>
                      <tr>
                        <th>True \ Predicted</th>
                        {Object.keys(m.per_meaning_f1).map((meaning) => (
                          <th key={meaning}>{meaning}</th>
                        ))}
                      </tr>
                    </thead>
                    <tbody>
                      {m.confusion_matrix.map((row, i) => (
                        <tr key={i}>
                          <td style={{ fontWeight: 600 }}>{Object.keys(m.per_meaning_f1)[i]}</td>
                          {row.map((val, j) => (
                            <td
                              key={j}
                              style={{
                                fontWeight: i === j ? 700 : 400,
                                color: i === j ? "var(--ok)" : val > 0 ? "var(--err)" : "var(--text-dim)",
                              }}
                            >
                              {val}
                            </td>
                          ))}
                        </tr>
                      ))}
                    </tbody>
                  </table>
                </>
              )}
            </div>
          ))}
        </div>
      )}

      {/* ---- Retrieval Evaluation ---- */}
      {section === "retrieval" && (
        <div className="panel">
          <div className="flex-row gap-2 mb-2">
            <button className="btn btn-secondary" onClick={loadRetrievalExample}>
              Load Example
            </button>
          </div>

          <label className="section-label">
            Search Queries
            <span className="text-dim"> — enter queries and what text they should find</span>
          </label>
          <table className="data-table mb-2">
            <thead>
              <tr>
                <th style={{ width: "50%" }}>Query</th>
                <th>Expected Match (text snippet)</th>
                <th style={{ width: 40 }} />
              </tr>
            </thead>
            <tbody>
              {retRows.map((row, i) => (
                <tr key={i}>
                  <td>
                    <input
                      value={row.query}
                      onChange={(e) => updateRetRow(i, "query", e.target.value)}
                      placeholder="A search query..."
                      style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
                    />
                  </td>
                  <td>
                    <input
                      value={row.relevantText}
                      onChange={(e) => updateRetRow(i, "relevantText", e.target.value)}
                      placeholder="Text that should match..."
                      style={{ width: "100%", background: "var(--surface)", border: "1px solid var(--border)", borderRadius: 4, padding: "4px 8px", color: "var(--text)" }}
                    />
                  </td>
                  <td>
                    {retRows.length > 1 && (
                      <button className="btn btn-secondary" onClick={() => removeRetRow(i)}>
                        ×
                      </button>
                    )}
                  </td>
                </tr>
              ))}
            </tbody>
          </table>
          <div className="flex-row gap-2 mb-2">
            <button className="btn btn-secondary" onClick={addRetRow}>
              + Add Query
            </button>
            <button
              className="btn btn-primary"
              onClick={runRetrieval}
              disabled={loading === "retrieval"}
            >
              {loading === "retrieval" ? "Evaluating..." : "Run Evaluation"}
            </button>
          </div>

          {/* retrieval is only ever set from a request with >= 1 query, so the
              mean computations below never divide by zero */}
          {retrieval && (
            <div className="mt-2">
              <table className="data-table">
                <thead>
                  <tr>
                    <th>Query</th>
                    <th>MRR</th>
                    <th>P@1</th>
                    <th>P@3</th>
                    <th>P@5</th>
                    <th>Top Score</th>
                  </tr>
                </thead>
                <tbody>
                  {retrieval.map((m, i) => (
                    <tr key={i}>
                      <td style={{ maxWidth: 300 }}>{m.query.length > 50 ? m.query.slice(0, 50) + "..." : m.query}</td>
                      <td>{m.mrr.toFixed(3)}</td>
                      <td>{m.precision_at_k["1"]?.toFixed(2) ?? "-"}</td>
                      <td>{m.precision_at_k["3"]?.toFixed(2) ?? "-"}</td>
                      <td>{m.precision_at_k["5"]?.toFixed(2) ?? "-"}</td>
                      <td>{m.top_score.toFixed(3)}</td>
                    </tr>
                  ))}
                </tbody>
              </table>

              <div className="metric-grid mt-3">
                <MetricCard
                  value={(retrieval.reduce((s, m) => s + m.mrr, 0) / retrieval.length).toFixed(3)}
                  label="Mean MRR"
                />
                <MetricCard
                  value={(retrieval.reduce((s, m) => s + (m.precision_at_k["5"] ?? 0), 0) / retrieval.length).toFixed(3)}
                  label="Mean P@5"
                />
                <MetricCard
                  value={(retrieval.reduce((s, m) => s + m.top_score, 0) / retrieval.length).toFixed(3)}
                  label="Mean Top Score"
                />
              </div>
            </div>
          )}
        </div>
      )}
    </div>
  );
}
|
frontend/src/components/KeywordAnalysis.tsx
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import type { KeywordAnalysisResponse } from "../types";
|
| 4 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
+
import ScoreBar from "./ScoreBar";
|
| 6 |
+
import StatusMessage from "./StatusMessage";
|
| 7 |
+
|
| 8 |
+
export default function KeywordAnalysis() {
|
| 9 |
+
const [keyword, setKeyword] = useState("");
|
| 10 |
+
const [topK, setTopK] = useState(5);
|
| 11 |
+
const [threshold, setThreshold] = useState(0.4);
|
| 12 |
+
const { data: analysis, loading, error, run } = useApiCall<KeywordAnalysisResponse>();
|
| 13 |
+
|
| 14 |
+
async function handleAnalyze() {
|
| 15 |
+
if (!keyword.trim()) return;
|
| 16 |
+
await run(() => api.analyzeKeyword({ keyword, top_k: topK, cluster_threshold: threshold }));
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
return (
|
| 20 |
+
<div>
|
| 21 |
+
<div className="panel">
|
| 22 |
+
<h2>Keyword Analysis</h2>
|
| 23 |
+
<p className="panel-desc">
|
| 24 |
+
Find all occurrences of a keyword, cluster them by contextual meaning,
|
| 25 |
+
and discover semantically similar passages for each meaning.
|
| 26 |
+
</p>
|
| 27 |
+
<div className="form-row">
|
| 28 |
+
<div className="form-group">
|
| 29 |
+
<label>Keyword</label>
|
| 30 |
+
<input
|
| 31 |
+
value={keyword}
|
| 32 |
+
onChange={(e) => setKeyword(e.target.value)}
|
| 33 |
+
placeholder="e.g. pizza"
|
| 34 |
+
onKeyDown={(e) => e.key === "Enter" && handleAnalyze()}
|
| 35 |
+
/>
|
| 36 |
+
</div>
|
| 37 |
+
<div className="form-group form-group-sm">
|
| 38 |
+
<label>Top K</label>
|
| 39 |
+
<input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
|
| 40 |
+
</div>
|
| 41 |
+
<div className="form-group form-group-md">
|
| 42 |
+
<label>Cluster Threshold</label>
|
| 43 |
+
<input type="number" value={threshold} onChange={(e) => setThreshold(+e.target.value)} min={0.1} max={1} step={0.05} />
|
| 44 |
+
</div>
|
| 45 |
+
<div className="form-group form-group-sm">
|
| 46 |
+
<label> </label>
|
| 47 |
+
<button className="btn btn-primary" onClick={handleAnalyze} disabled={loading || !keyword.trim()}>
|
| 48 |
+
{loading ? "Analyzing..." : "Analyze"}
|
| 49 |
+
</button>
|
| 50 |
+
</div>
|
| 51 |
+
</div>
|
| 52 |
+
</div>
|
| 53 |
+
|
| 54 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 55 |
+
|
| 56 |
+
{analysis && (
|
| 57 |
+
<div className="panel">
|
| 58 |
+
<h3>
|
| 59 |
+
"{analysis.keyword}" — {analysis.total_occurrences} occurrence(s),{" "}
|
| 60 |
+
{analysis.meaning_clusters.length} meaning cluster(s)
|
| 61 |
+
</h3>
|
| 62 |
+
|
| 63 |
+
{analysis.meaning_clusters.map((cluster) => (
|
| 64 |
+
<div key={cluster.cluster_id} className="result-card mt-2">
|
| 65 |
+
<div className="result-header">
|
| 66 |
+
<div>
|
| 67 |
+
<strong>Cluster {cluster.cluster_id}</strong>{" "}
|
| 68 |
+
<span className="tag">{cluster.size} occurrence(s)</span>
|
| 69 |
+
</div>
|
| 70 |
+
</div>
|
| 71 |
+
|
| 72 |
+
<div className="mt-1 mb-2">
|
| 73 |
+
<div className="section-label">Contexts:</div>
|
| 74 |
+
{cluster.contexts.map((ctx, i) => (
|
| 75 |
+
<div key={i} className="result-text" style={{ marginBottom: 4, paddingLeft: 12 }}>
|
| 76 |
+
<span className="badge" style={{ marginRight: 6 }}>{ctx.doc_id}</span>
|
| 77 |
+
{ctx.text.slice(0, 200)}...
|
| 78 |
+
</div>
|
| 79 |
+
))}
|
| 80 |
+
</div>
|
| 81 |
+
|
| 82 |
+
<div>
|
| 83 |
+
<div className="section-label">Similar passages:</div>
|
| 84 |
+
{cluster.similar_passages.map((sp) => (
|
| 85 |
+
<div key={sp.rank} className="flex-row" style={{ alignItems: "start", marginBottom: 6 }}>
|
| 86 |
+
<ScoreBar score={sp.score} />
|
| 87 |
+
<span className="result-text" style={{ flex: 1 }}>
|
| 88 |
+
<span className="badge" style={{ marginRight: 4 }}>{sp.doc_id}</span>
|
| 89 |
+
{sp.text.slice(0, 150)}...
|
| 90 |
+
</span>
|
| 91 |
+
</div>
|
| 92 |
+
))}
|
| 93 |
+
</div>
|
| 94 |
+
</div>
|
| 95 |
+
))}
|
| 96 |
+
</div>
|
| 97 |
+
)}
|
| 98 |
+
</div>
|
| 99 |
+
);
|
| 100 |
+
}
|
frontend/src/components/KeywordMatcher.tsx
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { MatchResponse } from "../types";
|
| 4 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
+
import ScoreBar from "./ScoreBar";
|
| 6 |
+
import StatusMessage from "./StatusMessage";
|
| 7 |
+
|
| 8 |
+
export default function KeywordMatcher() {
|
| 9 |
+
const [keyword, setKeyword] = useState("");
|
| 10 |
+
const [meaningsText, setMeaningsText] = useState("");
|
| 11 |
+
const { data: results, loading, error, setError, run } = useApiCall<MatchResponse>();
|
| 12 |
+
|
| 13 |
+
async function handleMatch() {
|
| 14 |
+
if (!keyword.trim() || !meaningsText.trim()) return;
|
| 15 |
+
const candidates = meaningsText.split("\n").map((s) => s.trim()).filter(Boolean);
|
| 16 |
+
if (candidates.length < 2) {
|
| 17 |
+
setError("Provide at least 2 candidate meanings (one per line).");
|
| 18 |
+
return;
|
| 19 |
+
}
|
| 20 |
+
await run(() => api.matchKeyword({ keyword, candidate_meanings: candidates }));
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
return (
|
| 24 |
+
<div>
|
| 25 |
+
<div className="panel">
|
| 26 |
+
<h2>Keyword Meaning Matcher</h2>
|
| 27 |
+
<p className="panel-desc">
|
| 28 |
+
Match each occurrence of a keyword to the most likely intended meaning.
|
| 29 |
+
For example: keyword "pizza" with candidates "food" and "school".
|
| 30 |
+
</p>
|
| 31 |
+
<div className="form-row">
|
| 32 |
+
<div className="form-group form-group-lg">
|
| 33 |
+
<label>Keyword</label>
|
| 34 |
+
<input value={keyword} onChange={(e) => setKeyword(e.target.value)} placeholder="e.g. pizza" />
|
| 35 |
+
</div>
|
| 36 |
+
</div>
|
| 37 |
+
<div className="form-group mb-2">
|
| 38 |
+
<label>Candidate Meanings (one per line)</label>
|
| 39 |
+
<textarea
|
| 40 |
+
value={meaningsText}
|
| 41 |
+
onChange={(e) => setMeaningsText(e.target.value)}
|
| 42 |
+
placeholder={`Italian food made with dough, tomato sauce, and cheese\nSchool, education, and academic activities`}
|
| 43 |
+
rows={4}
|
| 44 |
+
/>
|
| 45 |
+
</div>
|
| 46 |
+
<button className="btn btn-primary" onClick={handleMatch} disabled={loading || !keyword.trim() || !meaningsText.trim()}>
|
| 47 |
+
{loading ? "Matching..." : "Match"}
|
| 48 |
+
</button>
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 52 |
+
|
| 53 |
+
{results && (
|
| 54 |
+
<div className="panel">
|
| 55 |
+
<h3>Matches for "{results.keyword}" ({results.matches.length} occurrences)</h3>
|
| 56 |
+
|
| 57 |
+
{results.matches.map((m, idx) => (
|
| 58 |
+
<div key={idx} className="result-card mt-1">
|
| 59 |
+
<div className="result-header">
|
| 60 |
+
<div>
|
| 61 |
+
<span className="badge">{m.doc_id}</span>{" "}
|
| 62 |
+
<span className="tag">chunk {m.chunk_index}</span>
|
| 63 |
+
</div>
|
| 64 |
+
<span className="tag tag-best">{m.best_match}</span>
|
| 65 |
+
</div>
|
| 66 |
+
<div className="result-text mb-1">{m.text.slice(0, 250)}...</div>
|
| 67 |
+
<div className="flex-row flex-wrap gap-2">
|
| 68 |
+
{Object.entries(m.all_scores).map(([meaning, score]) => (
|
| 69 |
+
<div key={meaning} style={{ flex: "1 1 200px" }}>
|
| 70 |
+
<div
|
| 71 |
+
style={{
|
| 72 |
+
fontSize: "0.78rem",
|
| 73 |
+
color: meaning === m.best_match ? "var(--ok)" : "var(--text-dim)",
|
| 74 |
+
fontWeight: meaning === m.best_match ? 700 : 400,
|
| 75 |
+
marginBottom: 2,
|
| 76 |
+
}}
|
| 77 |
+
>
|
| 78 |
+
{meaning.slice(0, 60)}
|
| 79 |
+
</div>
|
| 80 |
+
<ScoreBar score={score} />
|
| 81 |
+
</div>
|
| 82 |
+
))}
|
| 83 |
+
</div>
|
| 84 |
+
</div>
|
| 85 |
+
))}
|
| 86 |
+
</div>
|
| 87 |
+
)}
|
| 88 |
+
</div>
|
| 89 |
+
);
|
| 90 |
+
}
|
frontend/src/components/LogViewer.tsx
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useEffect, useRef } from "react";
|
| 2 |
+
|
| 3 |
+
interface Props {
|
| 4 |
+
/** Whether to actively stream logs */
|
| 5 |
+
active: boolean;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
export default function LogViewer({ active }: Props) {
|
| 9 |
+
const [lines, setLines] = useState<string[]>([]);
|
| 10 |
+
const containerRef = useRef<HTMLDivElement>(null);
|
| 11 |
+
|
| 12 |
+
useEffect(() => {
|
| 13 |
+
if (!active) return;
|
| 14 |
+
|
| 15 |
+
setLines([]);
|
| 16 |
+
const evtSource = new EventSource("/api/logs/stream");
|
| 17 |
+
|
| 18 |
+
evtSource.onmessage = (event) => {
|
| 19 |
+
setLines((prev) => {
|
| 20 |
+
const next = [...prev, event.data];
|
| 21 |
+
// Keep last 200 lines
|
| 22 |
+
return next.length > 200 ? next.slice(-200) : next;
|
| 23 |
+
});
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
evtSource.onerror = () => {
|
| 27 |
+
// SSE will auto-reconnect, no action needed
|
| 28 |
+
};
|
| 29 |
+
|
| 30 |
+
return () => {
|
| 31 |
+
evtSource.close();
|
| 32 |
+
};
|
| 33 |
+
}, [active]);
|
| 34 |
+
|
| 35 |
+
useEffect(() => {
|
| 36 |
+
// Auto-scroll to bottom
|
| 37 |
+
if (containerRef.current) {
|
| 38 |
+
containerRef.current.scrollTop = containerRef.current.scrollHeight;
|
| 39 |
+
}
|
| 40 |
+
}, [lines]);
|
| 41 |
+
|
| 42 |
+
if (!active && lines.length === 0) return null;
|
| 43 |
+
|
| 44 |
+
return (
|
| 45 |
+
<div
|
| 46 |
+
ref={containerRef}
|
| 47 |
+
style={{
|
| 48 |
+
background: "#0a0c10",
|
| 49 |
+
border: "1px solid var(--border)",
|
| 50 |
+
borderRadius: "var(--radius)",
|
| 51 |
+
padding: "10px 14px",
|
| 52 |
+
marginTop: 12,
|
| 53 |
+
maxHeight: 220,
|
| 54 |
+
overflowY: "auto",
|
| 55 |
+
fontFamily: "'JetBrains Mono', 'Fira Code', 'Consolas', monospace",
|
| 56 |
+
fontSize: "0.75rem",
|
| 57 |
+
lineHeight: 1.7,
|
| 58 |
+
color: "var(--text-dim)",
|
| 59 |
+
}}
|
| 60 |
+
>
|
| 61 |
+
{lines.length === 0 && active && (
|
| 62 |
+
<span style={{ color: "var(--text-dim)", opacity: 0.5 }}>Waiting for logs...</span>
|
| 63 |
+
)}
|
| 64 |
+
{lines.map((line, i) => (
|
| 65 |
+
<div key={i} style={{ whiteSpace: "pre-wrap", wordBreak: "break-all" }}>
|
| 66 |
+
{line}
|
| 67 |
+
</div>
|
| 68 |
+
))}
|
| 69 |
+
</div>
|
| 70 |
+
);
|
| 71 |
+
}
|
frontend/src/components/MetricCard.tsx
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
interface Props {
  value: string | number;
  label: string;
  valueColor?: string;
}

/** Small labeled metric tile; the value's color can optionally be overridden. */
export default function MetricCard({ value, label, valueColor }: Props) {
  const valueStyle = valueColor ? { color: valueColor } : undefined;
  return (
    <div className="metric-card">
      <div className="metric-value" style={valueStyle}>{value}</div>
      <div className="metric-label">{label}</div>
    </div>
  );
}
|
frontend/src/components/ScoreBar.tsx
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { scoreColor } from "../utils/colors";
|
| 2 |
+
|
| 3 |
+
interface ScoreBarProps {
  score: number;
  max?: number;
}

/**
 * Horizontal bar visualizing `score` relative to `max` (default 1), with the
 * numeric value printed beside it. Fill width is clamped to [0, 100]% and the
 * color comes from the shared scoreColor scale.
 */
export default function ScoreBar({ score, max = 1 }: ScoreBarProps) {
  const rawPct = (score / max) * 100;
  const width = Math.max(0, Math.min(100, rawPct));
  const tint = scoreColor(score);
  return (
    <div className="score-bar-container">
      <div className="score-bar">
        <div className="score-bar-fill" style={{ width: `${width}%`, background: tint }} />
      </div>
      <span className="score-label" style={{ color: tint }}>{score.toFixed(4)}</span>
    </div>
  );
}
|
frontend/src/components/Select.tsx
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useRef, useEffect } from "react";
|
| 2 |
+
|
| 3 |
+
interface Option {
  value: string;
  label: string;
}

interface Props {
  options: Option[];
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
}

/**
 * Custom dropdown replacement for a native <select>. Clicking the trigger
 * toggles the option list; picking an option fires onChange and closes it;
 * clicking anywhere outside the component also closes it.
 */
export default function Select({ options, value, onChange, placeholder }: Props) {
  const [open, setOpen] = useState(false);
  const rootRef = useRef<HTMLDivElement>(null);

  // Close the dropdown on any mousedown outside the component.
  useEffect(() => {
    const onDocMouseDown = (e: MouseEvent) => {
      const root = rootRef.current;
      if (root && !root.contains(e.target as Node)) setOpen(false);
    };
    document.addEventListener("mousedown", onDocMouseDown);
    return () => document.removeEventListener("mousedown", onDocMouseDown);
  }, []);

  const current = options.find((o) => o.value === value);
  const triggerText = current?.label || placeholder || "Select...";

  return (
    <div className="custom-select" ref={rootRef}>
      <button
        type="button"
        className="custom-select-trigger"
        onClick={() => setOpen(!open)}
      >
        <span>{triggerText}</span>
        <span className="custom-select-arrow">{open ? "\u25b4" : "\u25be"}</span>
      </button>
      {open && (
        <div className="custom-select-dropdown">
          {options.map((o) => (
            <button
              type="button"
              key={o.value}
              className={`custom-select-option ${o.value === value ? "custom-select-option-active" : ""}`}
              onClick={() => {
                onChange(o.value);
                setOpen(false);
              }}
            >
              {o.label}
            </button>
          ))}
        </div>
      )}
    </div>
  );
}
|
frontend/src/components/SemanticSearch.tsx
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import type { QueryResultItem } from "../types";
|
| 4 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 5 |
+
import ScoreBar from "./ScoreBar";
|
| 6 |
+
import StatusMessage from "./StatusMessage";
|
| 7 |
+
|
| 8 |
+
export default function SemanticSearch() {
|
| 9 |
+
const [query, setQuery] = useState("");
|
| 10 |
+
const [topK, setTopK] = useState(10);
|
| 11 |
+
const { data: results, loading, error, run } = useApiCall<QueryResultItem[]>();
|
| 12 |
+
|
| 13 |
+
async function handleSearch() {
|
| 14 |
+
if (!query.trim()) return;
|
| 15 |
+
await run(() => api.query({ text: query, top_k: topK }).then((r) => r.results));
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
return (
|
| 19 |
+
<div>
|
| 20 |
+
<div className="panel">
|
| 21 |
+
<h2>Semantic Search</h2>
|
| 22 |
+
<p className="panel-desc">
|
| 23 |
+
Find passages most semantically similar to your query across the entire corpus.
|
| 24 |
+
</p>
|
| 25 |
+
<div className="form-row">
|
| 26 |
+
<div className="form-group">
|
| 27 |
+
<label>Query</label>
|
| 28 |
+
<input
|
| 29 |
+
value={query}
|
| 30 |
+
onChange={(e) => setQuery(e.target.value)}
|
| 31 |
+
placeholder="e.g. a place where children learn and take tests"
|
| 32 |
+
onKeyDown={(e) => e.key === "Enter" && handleSearch()}
|
| 33 |
+
/>
|
| 34 |
+
</div>
|
| 35 |
+
<div className="form-group form-group-sm">
|
| 36 |
+
<label>Top K</label>
|
| 37 |
+
<input type="number" value={topK} onChange={(e) => setTopK(+e.target.value)} min={1} max={50} />
|
| 38 |
+
</div>
|
| 39 |
+
<div className="form-group form-group-sm">
|
| 40 |
+
<label> </label>
|
| 41 |
+
<button className="btn btn-primary" onClick={handleSearch} disabled={loading || !query.trim()}>
|
| 42 |
+
{loading ? "Searching..." : "Search"}
|
| 43 |
+
</button>
|
| 44 |
+
</div>
|
| 45 |
+
</div>
|
| 46 |
+
</div>
|
| 47 |
+
|
| 48 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 49 |
+
|
| 50 |
+
{results && (
|
| 51 |
+
<div className="panel">
|
| 52 |
+
<h3>Results ({results.length})</h3>
|
| 53 |
+
{results.map((r) => (
|
| 54 |
+
<div key={`${r.doc_id}-${r.chunk_index}`} className="result-card">
|
| 55 |
+
<div className="result-header">
|
| 56 |
+
<div>
|
| 57 |
+
<span className="badge">#{r.rank}</span>{" "}
|
| 58 |
+
<span className="badge">{r.doc_id}</span>{" "}
|
| 59 |
+
<span className="tag">chunk {r.chunk_index}</span>
|
| 60 |
+
</div>
|
| 61 |
+
<ScoreBar score={r.score} />
|
| 62 |
+
</div>
|
| 63 |
+
<div className="result-text">{r.text}</div>
|
| 64 |
+
</div>
|
| 65 |
+
))}
|
| 66 |
+
</div>
|
| 67 |
+
)}
|
| 68 |
+
</div>
|
| 69 |
+
);
|
| 70 |
+
}
|
frontend/src/components/SimilarWords.tsx
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 4 |
+
import ScoreBar from "./ScoreBar";
|
| 5 |
+
import StatusMessage from "./StatusMessage";
|
| 6 |
+
|
| 7 |
+
// One row of the similar-word table returned by the API.
interface SimilarWord {
  word: string;
  score: number;
}

/**
 * Contextual nearest-neighbour lookup for a single word, backed by the
 * transformer embedding endpoint (contrast with static Word2Vec vectors).
 */
export default function SimilarWords() {
  const [word, setWord] = useState("");
  const [topK, setTopK] = useState(10);
  const { data: results, loading, error, run } = useApiCall<SimilarWord[]>();

  async function runLookup() {
    if (!word.trim()) return;
    // Send the trimmed word and keep only the neighbour list from the response.
    await run(() => api.similarWords({ word: word.trim(), top_k: topK }).then((resp) => resp.similar));
  }

  return (
    <div>
      <div className="panel">
        <h2>Similar Words</h2>
        <p className="panel-desc">
          Find words that appear in similar contexts using transformer embeddings.
          Unlike Word2Vec (static, one vector per word), this uses the model's contextual understanding.
        </p>
        <div className="form-row">
          <div className="form-group">
            <label>Word</label>
            <input
              value={word}
              onChange={(evt) => setWord(evt.target.value)}
              onKeyDown={(evt) => evt.key === "Enter" && runLookup()}
              placeholder="e.g. Epstein, flight, island"
            />
          </div>
          <div className="form-group form-group-sm">
            <label>Top K</label>
            <input type="number" value={topK} onChange={(evt) => setTopK(+evt.target.value)} min={1} max={50} />
          </div>
          <div className="form-group form-group-sm">
            <label> </label>
            <button className="btn btn-primary" onClick={runLookup} disabled={loading || !word.trim()}>
              {loading ? "Searching..." : "Find"}
            </button>
          </div>
        </div>
      </div>

      {error && <StatusMessage type="err" message={error} />}

      {results && results.length > 0 && (
        <div className="panel">
          <h3>Words similar to "{word}" ({results.length})</h3>
          <table className="data-table">
            <thead>
              <tr><th>Word</th><th>Similarity</th></tr>
            </thead>
            <tbody>
              {results.map((entry, idx) => (
                <tr key={idx}>
                  <td style={{ fontWeight: 600 }}>{entry.word}</td>
                  <td><ScoreBar score={entry.score} /></td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  );
}
|
frontend/src/components/StatusMessage.tsx
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
interface StatusMessageProps {
|
| 2 |
+
type: "ok" | "err" | "loading";
|
| 3 |
+
message: string;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
export default function StatusMessage({ type, message }: StatusMessageProps) {
|
| 7 |
+
return (
|
| 8 |
+
<div className={`status status-${type}`}>
|
| 9 |
+
{type === "loading" && <span className="spinner" />}
|
| 10 |
+
{message}
|
| 11 |
+
</div>
|
| 12 |
+
);
|
| 13 |
+
}
|
frontend/src/components/Switch.tsx
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
interface Props {
|
| 2 |
+
checked: boolean;
|
| 3 |
+
onChange: (checked: boolean) => void;
|
| 4 |
+
label?: string;
|
| 5 |
+
}
|
| 6 |
+
|
| 7 |
+
export default function Switch({ checked, onChange, label }: Props) {
|
| 8 |
+
return (
|
| 9 |
+
<label className="switch">
|
| 10 |
+
<button
|
| 11 |
+
className={`switch-track ${checked ? "switch-track-on" : ""}`}
|
| 12 |
+
onClick={() => onChange(!checked)}
|
| 13 |
+
type="button"
|
| 14 |
+
role="switch"
|
| 15 |
+
aria-checked={checked}
|
| 16 |
+
>
|
| 17 |
+
<span className="switch-thumb" />
|
| 18 |
+
</button>
|
| 19 |
+
{label && <span className="switch-label">{label}</span>}
|
| 20 |
+
</label>
|
| 21 |
+
);
|
| 22 |
+
}
|
frontend/src/components/TextCompare.tsx
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api } from "../api";
|
| 3 |
+
import { useApiCall } from "../hooks/useApiCall";
|
| 4 |
+
import { scoreColor } from "../utils/colors";
|
| 5 |
+
import StatusMessage from "./StatusMessage";
|
| 6 |
+
|
| 7 |
+
export default function TextCompare() {
|
| 8 |
+
const [textA, setTextA] = useState("");
|
| 9 |
+
const [textB, setTextB] = useState("");
|
| 10 |
+
const { data: similarity, loading, error, run } = useApiCall<number>();
|
| 11 |
+
|
| 12 |
+
async function handleCompare() {
|
| 13 |
+
if (!textA.trim() || !textB.trim()) return;
|
| 14 |
+
await run(() => api.compare({ text_a: textA, text_b: textB }).then((r) => r.similarity));
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
return (
|
| 18 |
+
<div>
|
| 19 |
+
<div className="panel">
|
| 20 |
+
<h2>Compare Texts</h2>
|
| 21 |
+
<p className="panel-desc">
|
| 22 |
+
Compute cosine similarity between two sentences/passages.
|
| 23 |
+
</p>
|
| 24 |
+
<div className="grid-2">
|
| 25 |
+
<div className="form-group">
|
| 26 |
+
<label>Text A</label>
|
| 27 |
+
<textarea
|
| 28 |
+
value={textA}
|
| 29 |
+
onChange={(e) => setTextA(e.target.value)}
|
| 30 |
+
placeholder="I love pizza so much"
|
| 31 |
+
rows={4}
|
| 32 |
+
/>
|
| 33 |
+
</div>
|
| 34 |
+
<div className="form-group">
|
| 35 |
+
<label>Text B</label>
|
| 36 |
+
<textarea
|
| 37 |
+
value={textB}
|
| 38 |
+
onChange={(e) => setTextB(e.target.value)}
|
| 39 |
+
placeholder="I love school so much"
|
| 40 |
+
rows={4}
|
| 41 |
+
/>
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
<div className="mt-2">
|
| 45 |
+
<button
|
| 46 |
+
className="btn btn-primary"
|
| 47 |
+
onClick={handleCompare}
|
| 48 |
+
disabled={loading || !textA.trim() || !textB.trim()}
|
| 49 |
+
>
|
| 50 |
+
{loading ? "Computing..." : "Compare"}
|
| 51 |
+
</button>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
|
| 55 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 56 |
+
|
| 57 |
+
{similarity !== null && (
|
| 58 |
+
<div className="panel">
|
| 59 |
+
<div className="similarity-gauge">
|
| 60 |
+
<div className="similarity-value" style={{ color: scoreColor(similarity) }}>
|
| 61 |
+
{similarity.toFixed(4)}
|
| 62 |
+
</div>
|
| 63 |
+
<div className="similarity-label">Cosine Similarity</div>
|
| 64 |
+
<div style={{ width: "100%", maxWidth: 400, marginTop: 16 }}>
|
| 65 |
+
<div className="score-bar" style={{ width: "100%", height: 12 }}>
|
| 66 |
+
<div
|
| 67 |
+
className="score-bar-fill"
|
| 68 |
+
style={{
|
| 69 |
+
width: `${Math.max(0, similarity) * 100}%`,
|
| 70 |
+
background: scoreColor(similarity),
|
| 71 |
+
}}
|
| 72 |
+
/>
|
| 73 |
+
</div>
|
| 74 |
+
<div className="score-bar-legend">
|
| 75 |
+
<span>0 (unrelated)</span>
|
| 76 |
+
<span>1 (identical)</span>
|
| 77 |
+
</div>
|
| 78 |
+
</div>
|
| 79 |
+
</div>
|
| 80 |
+
</div>
|
| 81 |
+
)}
|
| 82 |
+
</div>
|
| 83 |
+
);
|
| 84 |
+
}
|
frontend/src/components/Toggle.tsx
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
interface Option {
|
| 2 |
+
value: string;
|
| 3 |
+
label: string;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
interface Props {
|
| 7 |
+
options: Option[];
|
| 8 |
+
value: string;
|
| 9 |
+
onChange: (value: string) => void;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
export default function Toggle({ options, value, onChange }: Props) {
|
| 13 |
+
return (
|
| 14 |
+
<div className="toggle">
|
| 15 |
+
{options.map((opt) => (
|
| 16 |
+
<button
|
| 17 |
+
key={opt.value}
|
| 18 |
+
className={`toggle-option ${opt.value === value ? "toggle-option-active" : ""}`}
|
| 19 |
+
onClick={() => onChange(opt.value)}
|
| 20 |
+
type="button"
|
| 21 |
+
>
|
| 22 |
+
{opt.label}
|
| 23 |
+
</button>
|
| 24 |
+
))}
|
| 25 |
+
</div>
|
| 26 |
+
);
|
| 27 |
+
}
|
frontend/src/components/TrainingPanel.tsx
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { TrainResponse, QueryResultItem, CompareResponse } from "../types";
|
| 4 |
+
import { useCorpusLoader } from "../hooks/useCorpusLoader";
|
| 5 |
+
import { scoreColor } from "../utils/colors";
|
| 6 |
+
import ScoreBar from "./ScoreBar";
|
| 7 |
+
import StatusMessage from "./StatusMessage";
|
| 8 |
+
import MetricCard from "./MetricCard";
|
| 9 |
+
import Toggle from "./Toggle";
|
| 10 |
+
import Select from "./Select";
|
| 11 |
+
import LogViewer from "./LogViewer";
|
| 12 |
+
|
| 13 |
+
type Strategy = "unsupervised" | "contrastive" | "keywords";
|
| 14 |
+
|
| 15 |
+
interface SimilarWord {
|
| 16 |
+
word: string;
|
| 17 |
+
score: number;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
const STRATEGIES: { id: Strategy; label: string; desc: string }[] = [
|
| 21 |
+
{ id: "unsupervised", label: "Unsupervised", desc: "Soft-label domain adaptation. Samples random pairs and fine-tunes using the model's own similarity scores." },
|
| 22 |
+
{ id: "contrastive", label: "Contrastive", desc: "Adjacent sentences = positive pairs. Learns document structure with in-batch negatives and validation." },
|
| 23 |
+
{ id: "keywords", label: "Keyword-supervised", desc: "You provide keyword→meaning map. Best if you know the code words." },
|
| 24 |
+
];
|
| 25 |
+
|
| 26 |
+
const MODELS = [
|
| 27 |
+
{ value: "all-MiniLM-L6-v2", label: "all-MiniLM-L6-v2 (fast)" },
|
| 28 |
+
{ value: "all-mpnet-base-v2", label: "all-mpnet-base-v2 (best quality)" },
|
| 29 |
+
];
|
| 30 |
+
|
| 31 |
+
export default function TrainingPanel() {
|
| 32 |
+
// Training
|
| 33 |
+
const [strategy, setStrategy] = useState<Strategy>("contrastive");
|
| 34 |
+
const [baseModel, setBaseModel] = useState("all-MiniLM-L6-v2");
|
| 35 |
+
const [outputPath, setOutputPath] = useState("./trained_model");
|
| 36 |
+
const [epochs, setEpochs] = useState(5);
|
| 37 |
+
const [batchSize, setBatchSize] = useState(16);
|
| 38 |
+
const [keywordMapText, setKeywordMapText] = useState('{\n "pizza": "school",\n "pepperoni": "math class"\n}');
|
| 39 |
+
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 40 |
+
const [training, setTraining] = useState(false);
|
| 41 |
+
const [result, setResult] = useState<TrainResponse | null>(null);
|
| 42 |
+
|
| 43 |
+
const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
|
| 44 |
+
|
| 45 |
+
// Similar words
|
| 46 |
+
const [simWord, setSimWord] = useState("");
|
| 47 |
+
const [simTopK, setSimTopK] = useState(10);
|
| 48 |
+
const [simResults, setSimResults] = useState<SimilarWord[]>([]);
|
| 49 |
+
const [simLoading, setSimLoading] = useState(false);
|
| 50 |
+
|
| 51 |
+
// Compare
|
| 52 |
+
const [compTextA, setCompTextA] = useState("");
|
| 53 |
+
const [compTextB, setCompTextB] = useState("");
|
| 54 |
+
const [compResult, setCompResult] = useState<CompareResponse | null>(null);
|
| 55 |
+
const [compLoading, setCompLoading] = useState(false);
|
| 56 |
+
|
| 57 |
+
// Search
|
| 58 |
+
const [queryText, setQueryText] = useState("");
|
| 59 |
+
const [queryTopK, setQueryTopK] = useState(5);
|
| 60 |
+
const [queryResults, setQueryResults] = useState<QueryResultItem[]>([]);
|
| 61 |
+
const [queryLoading, setQueryLoading] = useState(false);
|
| 62 |
+
|
| 63 |
+
const ready = result !== null;
|
| 64 |
+
|
| 65 |
+
async function handleTrain() {
|
| 66 |
+
setTraining(true); setError(""); setResult(null);
|
| 67 |
+
try {
|
| 68 |
+
const corpus = parseCorpus();
|
| 69 |
+
if (!corpus.length) { setError("Corpus is empty."); setTraining(false); return; }
|
| 70 |
+
|
| 71 |
+
const base = { corpus_texts: corpus, base_model: baseModel, output_path: outputPath, epochs, batch_size: batchSize };
|
| 72 |
+
let res: TrainResponse;
|
| 73 |
+
|
| 74 |
+
if (strategy === "unsupervised") {
|
| 75 |
+
res = await api.trainUnsupervised(base);
|
| 76 |
+
} else if (strategy === "contrastive") {
|
| 77 |
+
res = await api.trainContrastive(base);
|
| 78 |
+
} else {
|
| 79 |
+
const kw = JSON.parse(keywordMapText);
|
| 80 |
+
res = await api.trainKeywords({ ...base, keyword_meanings: kw });
|
| 81 |
+
}
|
| 82 |
+
setResult(res);
|
| 83 |
+
} catch (e) {
|
| 84 |
+
setError(e instanceof SyntaxError ? "Invalid JSON in keyword map." : getErrorMessage(e));
|
| 85 |
+
} finally {
|
| 86 |
+
setTraining(false);
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
async function handleSimilarWords() {
|
| 91 |
+
setSimLoading(true); setError("");
|
| 92 |
+
try {
|
| 93 |
+
const res = await api.similarWords({ word: simWord, top_k: simTopK });
|
| 94 |
+
setSimResults(res.similar);
|
| 95 |
+
} catch (err) {
|
| 96 |
+
setError(getErrorMessage(err));
|
| 97 |
+
} finally {
|
| 98 |
+
setSimLoading(false);
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
async function handleCompare() {
|
| 103 |
+
setCompLoading(true); setError("");
|
| 104 |
+
try {
|
| 105 |
+
const res = await api.compare({ text_a: compTextA, text_b: compTextB });
|
| 106 |
+
setCompResult(res);
|
| 107 |
+
} catch (err) {
|
| 108 |
+
setError(getErrorMessage(err));
|
| 109 |
+
} finally {
|
| 110 |
+
setCompLoading(false);
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
async function handleQuery() {
|
| 115 |
+
setQueryLoading(true); setError("");
|
| 116 |
+
try {
|
| 117 |
+
const res = await api.query({ text: queryText, top_k: queryTopK });
|
| 118 |
+
setQueryResults(res.results);
|
| 119 |
+
} catch (err) {
|
| 120 |
+
setError(getErrorMessage(err));
|
| 121 |
+
} finally {
|
| 122 |
+
setQueryLoading(false);
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
return (
|
| 127 |
+
<div>
|
| 128 |
+
{/* 1. Training (strategy + config + corpus merged) */}
|
| 129 |
+
<div className="panel">
|
| 130 |
+
<h2>1. Fine-tune Transformer</h2>
|
| 131 |
+
<p className="panel-desc">
|
| 132 |
+
Fine-tune a pre-trained sentence transformer on your corpus to improve contextual understanding.
|
| 133 |
+
</p>
|
| 134 |
+
|
| 135 |
+
<div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
|
| 136 |
+
<button className="btn btn-secondary" onClick={loadFromEngine}
|
| 137 |
+
disabled={corpusLoading}>
|
| 138 |
+
{corpusLoading ? "Loading..." : "Load from Engine"}
|
| 139 |
+
</button>
|
| 140 |
+
{corpusText && (
|
| 141 |
+
<button className="btn btn-secondary" onClick={() => setCorpusText("")}>
|
| 142 |
+
Clear
|
| 143 |
+
</button>
|
| 144 |
+
)}
|
| 145 |
+
</div>
|
| 146 |
+
<div className="form-group" style={{ marginBottom: 12 }}>
|
| 147 |
+
<label>
|
| 148 |
+
Corpus (separate documents with blank lines)
|
| 149 |
+
{corpusText && (
|
| 150 |
+
<span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
|
| 151 |
+
{" "} — {parseCorpus().length} documents detected
|
| 152 |
+
</span>
|
| 153 |
+
)}
|
| 154 |
+
</label>
|
| 155 |
+
<textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
|
| 156 |
+
placeholder="Document 1 text...\n\nDocument 2 text..." />
|
| 157 |
+
</div>
|
| 158 |
+
|
| 159 |
+
<label className="section-label">Strategy</label>
|
| 160 |
+
<Toggle
|
| 161 |
+
options={STRATEGIES.map(s => ({ value: s.id, label: s.label }))}
|
| 162 |
+
value={strategy}
|
| 163 |
+
onChange={(v) => setStrategy(v as Strategy)}
|
| 164 |
+
/>
|
| 165 |
+
<p style={{ color: "var(--text-dim)", fontSize: "0.85rem", marginBottom: 12 }}>
|
| 166 |
+
{STRATEGIES.find(s => s.id === strategy)?.desc}
|
| 167 |
+
</p>
|
| 168 |
+
|
| 169 |
+
{strategy === "keywords" && (
|
| 170 |
+
<div className="form-group" style={{ marginBottom: 12 }}>
|
| 171 |
+
<label>Keyword → Meaning Map (JSON)</label>
|
| 172 |
+
<textarea value={keywordMapText} onChange={e => setKeywordMapText(e.target.value)}
|
| 173 |
+
rows={4} style={{ fontFamily: "monospace", fontSize: "0.8rem" }} />
|
| 174 |
+
</div>
|
| 175 |
+
)}
|
| 176 |
+
|
| 177 |
+
<div className="form-row" style={{ marginBottom: 12 }}>
|
| 178 |
+
<div className="form-group">
|
| 179 |
+
<label>Base Model</label>
|
| 180 |
+
<Select options={MODELS} value={baseModel} onChange={setBaseModel} />
|
| 181 |
+
</div>
|
| 182 |
+
</div>
|
| 183 |
+
|
| 184 |
+
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 185 |
+
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 186 |
+
</button>
|
| 187 |
+
|
| 188 |
+
{showAdvanced && (
|
| 189 |
+
<div className="advanced-section">
|
| 190 |
+
<div className="form-row">
|
| 191 |
+
<div className="form-group" style={{ maxWidth: 100 }}>
|
| 192 |
+
<label>Epochs</label>
|
| 193 |
+
<input type="number" value={epochs} onChange={e => setEpochs(+e.target.value)} min={1} max={50} />
|
| 194 |
+
</div>
|
| 195 |
+
<div className="form-group" style={{ maxWidth: 120 }}>
|
| 196 |
+
<label>Batch Size</label>
|
| 197 |
+
<input type="number" value={batchSize} onChange={e => setBatchSize(+e.target.value)} min={4} max={128} />
|
| 198 |
+
</div>
|
| 199 |
+
<div className="form-group" style={{ maxWidth: 200 }}>
|
| 200 |
+
<label>Output Path</label>
|
| 201 |
+
<input value={outputPath} onChange={e => setOutputPath(e.target.value)} />
|
| 202 |
+
</div>
|
| 203 |
+
</div>
|
| 204 |
+
</div>
|
| 205 |
+
)}
|
| 206 |
+
|
| 207 |
+
<button className="btn btn-primary" onClick={handleTrain}
|
| 208 |
+
disabled={training || !corpusText.trim()} style={{ marginTop: 8 }}>
|
| 209 |
+
{training ? <><span className="spinner" /> Training...</> : "Start Training"}
|
| 210 |
+
</button>
|
| 211 |
+
|
| 212 |
+
<LogViewer active={training} />
|
| 213 |
+
</div>
|
| 214 |
+
|
| 215 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 216 |
+
|
| 217 |
+
{result && (
|
| 218 |
+
<div className="panel">
|
| 219 |
+
<h2>Training Complete</h2>
|
| 220 |
+
<div className="metric-grid">
|
| 221 |
+
<MetricCard value={result.training_pairs} label="Training Pairs" />
|
| 222 |
+
<MetricCard value={result.epochs} label="Epochs" />
|
| 223 |
+
<MetricCard value={`${result.seconds}s`} label="Time" />
|
| 224 |
+
</div>
|
| 225 |
+
<StatusMessage type="ok"
|
| 226 |
+
message={`Model saved: ${result.model_path} — use this path in the Setup tab.`} />
|
| 227 |
+
</div>
|
| 228 |
+
)}
|
| 229 |
+
|
| 230 |
+
{/* 2. Similar Words */}
|
| 231 |
+
<div className="panel">
|
| 232 |
+
<h2>2. Similar Words</h2>
|
| 233 |
+
<p className="panel-desc">
|
| 234 |
+
Find words that appear in similar contexts using transformer embeddings.
|
| 235 |
+
</p>
|
| 236 |
+
<div className="form-row">
|
| 237 |
+
<div className="form-group">
|
| 238 |
+
<label>Word</label>
|
| 239 |
+
<input value={simWord} onChange={e => setSimWord(e.target.value)}
|
| 240 |
+
onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
|
| 241 |
+
placeholder="e.g. pizza" />
|
| 242 |
+
</div>
|
| 243 |
+
<div className="form-group form-group-sm">
|
| 244 |
+
<label>Top K</label>
|
| 245 |
+
<input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
|
| 246 |
+
</div>
|
| 247 |
+
<div className="form-group form-group-sm">
|
| 248 |
+
<label> </label>
|
| 249 |
+
<button className="btn btn-primary" onClick={handleSimilarWords}
|
| 250 |
+
disabled={simLoading || !simWord.trim()}>
|
| 251 |
+
{simLoading ? "Searching..." : "Find"}
|
| 252 |
+
</button>
|
| 253 |
+
</div>
|
| 254 |
+
</div>
|
| 255 |
+
|
| 256 |
+
{simResults.length > 0 && (
|
| 257 |
+
<table className="data-table" style={{ marginTop: 12 }}>
|
| 258 |
+
<thead>
|
| 259 |
+
<tr><th>Word</th><th>Similarity</th></tr>
|
| 260 |
+
</thead>
|
| 261 |
+
<tbody>
|
| 262 |
+
{simResults.map((r, i) => (
|
| 263 |
+
<tr key={i}>
|
| 264 |
+
<td style={{ fontWeight: 600 }}>{r.word}</td>
|
| 265 |
+
<td><ScoreBar score={r.score} /></td>
|
| 266 |
+
</tr>
|
| 267 |
+
))}
|
| 268 |
+
</tbody>
|
| 269 |
+
</table>
|
| 270 |
+
)}
|
| 271 |
+
</div>
|
| 272 |
+
|
| 273 |
+
{/* 3. Compare Texts */}
|
| 274 |
+
<div className="panel">
|
| 275 |
+
<h2>3. Compare Texts</h2>
|
| 276 |
+
<p className="panel-desc">
|
| 277 |
+
Sentence similarity via transformer contextual embeddings.
|
| 278 |
+
</p>
|
| 279 |
+
<div className="form-row">
|
| 280 |
+
<div className="form-group">
|
| 281 |
+
<label>Text A</label>
|
| 282 |
+
<input value={compTextA} onChange={e => setCompTextA(e.target.value)}
|
| 283 |
+
placeholder="pizza gives me homework" />
|
| 284 |
+
</div>
|
| 285 |
+
<div className="form-group">
|
| 286 |
+
<label>Text B</label>
|
| 287 |
+
<input value={compTextB} onChange={e => setCompTextB(e.target.value)}
|
| 288 |
+
placeholder="school gives me homework" />
|
| 289 |
+
</div>
|
| 290 |
+
</div>
|
| 291 |
+
<button className="btn btn-primary" onClick={handleCompare}
|
| 292 |
+
disabled={compLoading || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
|
| 293 |
+
{compLoading ? "Comparing..." : "Compare"}
|
| 294 |
+
</button>
|
| 295 |
+
|
| 296 |
+
{compResult && (
|
| 297 |
+
<div className="similarity-gauge" style={{ marginTop: 16 }}>
|
| 298 |
+
<div className="similarity-value"
|
| 299 |
+
style={{ color: scoreColor(compResult.similarity) }}>
|
| 300 |
+
{compResult.similarity.toFixed(4)}
|
| 301 |
+
</div>
|
| 302 |
+
<div className="similarity-label">Transformer Cosine Similarity</div>
|
| 303 |
+
</div>
|
| 304 |
+
)}
|
| 305 |
+
</div>
|
| 306 |
+
|
| 307 |
+
{/* 4. Semantic Search */}
|
| 308 |
+
<div className="panel">
|
| 309 |
+
<h2>4. Semantic Search</h2>
|
| 310 |
+
<p className="panel-desc">
|
| 311 |
+
Search your corpus using transformer embeddings.
|
| 312 |
+
</p>
|
| 313 |
+
<div className="form-row">
|
| 314 |
+
<div className="form-group">
|
| 315 |
+
<label>Query</label>
|
| 316 |
+
<input value={queryText} onChange={e => setQueryText(e.target.value)}
|
| 317 |
+
onKeyDown={e => e.key === "Enter" && handleQuery()}
|
| 318 |
+
placeholder="a place where children learn" />
|
| 319 |
+
</div>
|
| 320 |
+
<div className="form-group form-group-sm">
|
| 321 |
+
<label>Top K</label>
|
| 322 |
+
<input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
|
| 323 |
+
</div>
|
| 324 |
+
<div className="form-group form-group-sm">
|
| 325 |
+
<label> </label>
|
| 326 |
+
<button className="btn btn-primary" onClick={handleQuery}
|
| 327 |
+
disabled={queryLoading || !queryText.trim()}>
|
| 328 |
+
{queryLoading ? "Searching..." : "Search"}
|
| 329 |
+
</button>
|
| 330 |
+
</div>
|
| 331 |
+
</div>
|
| 332 |
+
|
| 333 |
+
{queryResults.length > 0 && (
|
| 334 |
+
<div style={{ marginTop: 12 }}>
|
| 335 |
+
{queryResults.map((r, i) => (
|
| 336 |
+
<div key={i} className="result-card">
|
| 337 |
+
<div className="result-header">
|
| 338 |
+
<span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
|
| 339 |
+
<ScoreBar score={r.score} />
|
| 340 |
+
</div>
|
| 341 |
+
<div className="result-text">{r.text}</div>
|
| 342 |
+
</div>
|
| 343 |
+
))}
|
| 344 |
+
</div>
|
| 345 |
+
)}
|
| 346 |
+
</div>
|
| 347 |
+
</div>
|
| 348 |
+
);
|
| 349 |
+
}
|
frontend/src/components/Word2VecPanel.tsx
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
import type { W2VInitResponse, W2VQueryResult, W2VSimilarWord, CompareResponse } from "../types";
|
| 4 |
+
import { useCorpusLoader } from "../hooks/useCorpusLoader";
|
| 5 |
+
import { scoreColor } from "../utils/colors";
|
| 6 |
+
import ScoreBar from "./ScoreBar";
|
| 7 |
+
import StatusMessage from "./StatusMessage";
|
| 8 |
+
import LogViewer from "./LogViewer";
|
| 9 |
+
import MetricCard from "./MetricCard";
|
| 10 |
+
|
| 11 |
+
export default function Word2VecPanel() {
|
| 12 |
+
// Init
|
| 13 |
+
const [vectorSize, setVectorSize] = useState(100);
|
| 14 |
+
const [windowSize, setWindowSize] = useState(5);
|
| 15 |
+
const [w2vEpochs, setW2vEpochs] = useState(50);
|
| 16 |
+
const [showAdvanced, setShowAdvanced] = useState(false);
|
| 17 |
+
const [initLoading, setInitLoading] = useState(false);
|
| 18 |
+
const [initResult, setInitResult] = useState<W2VInitResponse | null>(null);
|
| 19 |
+
|
| 20 |
+
const { corpusText, setCorpusText, loading: corpusLoading, error, setError, parseCorpus, loadFromEngine } = useCorpusLoader();
|
| 21 |
+
|
| 22 |
+
// Similar words
|
| 23 |
+
const [simWord, setSimWord] = useState("");
|
| 24 |
+
const [simTopK, setSimTopK] = useState(10);
|
| 25 |
+
const [simResults, setSimResults] = useState<W2VSimilarWord[]>([]);
|
| 26 |
+
const [simLoading, setSimLoading] = useState(false);
|
| 27 |
+
|
| 28 |
+
// Compare
|
| 29 |
+
const [compTextA, setCompTextA] = useState("");
|
| 30 |
+
const [compTextB, setCompTextB] = useState("");
|
| 31 |
+
const [compResult, setCompResult] = useState<CompareResponse | null>(null);
|
| 32 |
+
const [compLoading, setCompLoading] = useState(false);
|
| 33 |
+
|
| 34 |
+
// Query
|
| 35 |
+
const [queryText, setQueryText] = useState("");
|
| 36 |
+
const [queryTopK, setQueryTopK] = useState(5);
|
| 37 |
+
const [queryResults, setQueryResults] = useState<W2VQueryResult[]>([]);
|
| 38 |
+
const [queryLoading, setQueryLoading] = useState(false);
|
| 39 |
+
|
| 40 |
+
async function handleInit() {
|
| 41 |
+
setInitLoading(true); setError(""); setInitResult(null);
|
| 42 |
+
try {
|
| 43 |
+
const corpus = parseCorpus();
|
| 44 |
+
if (!corpus.length) { setError("Corpus is empty."); setInitLoading(false); return; }
|
| 45 |
+
const res = await api.w2vInit({
|
| 46 |
+
corpus_texts: corpus,
|
| 47 |
+
vector_size: vectorSize,
|
| 48 |
+
window: windowSize,
|
| 49 |
+
epochs: w2vEpochs,
|
| 50 |
+
});
|
| 51 |
+
setInitResult(res);
|
| 52 |
+
} catch (err) {
|
| 53 |
+
setError(getErrorMessage(err));
|
| 54 |
+
} finally {
|
| 55 |
+
setInitLoading(false);
|
| 56 |
+
}
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
async function handleSimilarWords() {
|
| 60 |
+
setSimLoading(true); setError("");
|
| 61 |
+
try {
|
| 62 |
+
const res = await api.w2vSimilarWords({ word: simWord, top_k: simTopK });
|
| 63 |
+
setSimResults(res.similar);
|
| 64 |
+
} catch (err) {
|
| 65 |
+
setError(getErrorMessage(err));
|
| 66 |
+
} finally {
|
| 67 |
+
setSimLoading(false);
|
| 68 |
+
}
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
async function handleCompare() {
|
| 72 |
+
setCompLoading(true); setError("");
|
| 73 |
+
try {
|
| 74 |
+
const res = await api.w2vCompare({ text_a: compTextA, text_b: compTextB });
|
| 75 |
+
setCompResult(res);
|
| 76 |
+
} catch (err) {
|
| 77 |
+
setError(getErrorMessage(err));
|
| 78 |
+
} finally {
|
| 79 |
+
setCompLoading(false);
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
async function handleQuery() {
|
| 84 |
+
setQueryLoading(true); setError("");
|
| 85 |
+
try {
|
| 86 |
+
const res = await api.w2vQuery({ text: queryText, top_k: queryTopK });
|
| 87 |
+
setQueryResults(res.results);
|
| 88 |
+
} catch (err) {
|
| 89 |
+
setError(getErrorMessage(err));
|
| 90 |
+
} finally {
|
| 91 |
+
setQueryLoading(false);
|
| 92 |
+
}
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
const ready = initResult !== null;
|
| 96 |
+
|
| 97 |
+
return (
|
| 98 |
+
<div>
|
| 99 |
+
{/* 1. Training */}
|
| 100 |
+
<div className="panel">
|
| 101 |
+
<h2>1. Train Word2Vec (gensim)</h2>
|
| 102 |
+
<p className="panel-desc">
|
| 103 |
+
Static embeddings — one vector per word, no context awareness.
|
| 104 |
+
Useful as a baseline to compare against the transformer approach.
|
| 105 |
+
</p>
|
| 106 |
+
<div style={{ display: "flex", gap: 8, marginBottom: 10 }}>
|
| 107 |
+
<button className="btn btn-secondary" onClick={loadFromEngine}
|
| 108 |
+
disabled={corpusLoading}>
|
| 109 |
+
{corpusLoading ? "Loading..." : "Load from Engine"}
|
| 110 |
+
</button>
|
| 111 |
+
{corpusText && (
|
| 112 |
+
<button className="btn btn-secondary" onClick={() => setCorpusText("")}>
|
| 113 |
+
Clear
|
| 114 |
+
</button>
|
| 115 |
+
)}
|
| 116 |
+
</div>
|
| 117 |
+
<div className="form-group" style={{ marginBottom: 12 }}>
|
| 118 |
+
<label>
|
| 119 |
+
Corpus (separate documents with blank lines)
|
| 120 |
+
{corpusText && (
|
| 121 |
+
<span style={{ color: "var(--text-dim)", fontWeight: 400 }}>
|
| 122 |
+
{" "} — {parseCorpus().length} documents detected
|
| 123 |
+
</span>
|
| 124 |
+
)}
|
| 125 |
+
</label>
|
| 126 |
+
<textarea value={corpusText} onChange={e => setCorpusText(e.target.value)} rows={8}
|
| 127 |
+
placeholder="Document 1 text...\n\nDocument 2 text..." />
|
| 128 |
+
</div>
|
| 129 |
+
<button className="advanced-toggle" onClick={() => setShowAdvanced(!showAdvanced)}>
|
| 130 |
+
{showAdvanced ? "\u25be" : "\u25b8"} Advanced Settings
|
| 131 |
+
</button>
|
| 132 |
+
|
| 133 |
+
{showAdvanced && (
|
| 134 |
+
<div className="advanced-section">
|
| 135 |
+
<div className="form-row">
|
| 136 |
+
<div className="form-group" style={{ maxWidth: 120 }}>
|
| 137 |
+
<label>Vector Size</label>
|
| 138 |
+
<input type="number" value={vectorSize} onChange={e => setVectorSize(+e.target.value)} min={50} max={300} />
|
| 139 |
+
</div>
|
| 140 |
+
<div className="form-group" style={{ maxWidth: 120 }}>
|
| 141 |
+
<label>Window</label>
|
| 142 |
+
<input type="number" value={windowSize} onChange={e => setWindowSize(+e.target.value)} min={2} max={15} />
|
| 143 |
+
</div>
|
| 144 |
+
<div className="form-group" style={{ maxWidth: 120 }}>
|
| 145 |
+
<label>Epochs</label>
|
| 146 |
+
<input type="number" value={w2vEpochs} onChange={e => setW2vEpochs(+e.target.value)} min={5} max={200} />
|
| 147 |
+
</div>
|
| 148 |
+
</div>
|
| 149 |
+
</div>
|
| 150 |
+
)}
|
| 151 |
+
|
| 152 |
+
<button className="btn btn-primary" onClick={handleInit}
|
| 153 |
+
disabled={initLoading || !corpusText.trim()} style={{ marginTop: 8 }}>
|
| 154 |
+
{initLoading ? <><span className="spinner" /> Training...</> : "Train Word2Vec"}
|
| 155 |
+
</button>
|
| 156 |
+
|
| 157 |
+
<LogViewer active={initLoading} />
|
| 158 |
+
</div>
|
| 159 |
+
|
| 160 |
+
{error && <StatusMessage type="err" message={error} />}
|
| 161 |
+
|
| 162 |
+
{initResult && (
|
| 163 |
+
<div className="panel">
|
| 164 |
+
<h2>Word2Vec Ready</h2>
|
| 165 |
+
<div className="metric-grid">
|
| 166 |
+
<MetricCard value={initResult.vocab_size} label="Vocabulary" />
|
| 167 |
+
<MetricCard value={initResult.sentences} label="Sentences" />
|
| 168 |
+
<MetricCard value={initResult.vector_size} label="Dimensions" />
|
| 169 |
+
<MetricCard value={`${initResult.seconds}s`} label="Time" />
|
| 170 |
+
</div>
|
| 171 |
+
</div>
|
| 172 |
+
)}
|
| 173 |
+
|
| 174 |
+
{/* 2. Similar Words */}
|
| 175 |
+
<div className="panel">
|
| 176 |
+
<h2>2. Similar Words</h2>
|
| 177 |
+
<p className="panel-desc">
|
| 178 |
+
Find words that appear in similar contexts using Word2Vec static embeddings.
|
| 179 |
+
</p>
|
| 180 |
+
<div className="form-row">
|
| 181 |
+
<div className="form-group">
|
| 182 |
+
<label>Word</label>
|
| 183 |
+
<input value={simWord} onChange={e => setSimWord(e.target.value)}
|
| 184 |
+
onKeyDown={e => e.key === "Enter" && handleSimilarWords()}
|
| 185 |
+
placeholder="e.g. pizza" />
|
| 186 |
+
</div>
|
| 187 |
+
<div className="form-group form-group-sm">
|
| 188 |
+
<label>Top K</label>
|
| 189 |
+
<input type="number" value={simTopK} onChange={e => setSimTopK(+e.target.value)} min={1} max={50} />
|
| 190 |
+
</div>
|
| 191 |
+
<div className="form-group form-group-sm">
|
| 192 |
+
<label> </label>
|
| 193 |
+
<button className="btn btn-primary" onClick={handleSimilarWords}
|
| 194 |
+
disabled={simLoading || !ready || !simWord.trim()}>
|
| 195 |
+
{simLoading ? "Searching..." : "Find"}
|
| 196 |
+
</button>
|
| 197 |
+
</div>
|
| 198 |
+
</div>
|
| 199 |
+
|
| 200 |
+
{simResults.length > 0 && (
|
| 201 |
+
<table className="data-table" style={{ marginTop: 12 }}>
|
| 202 |
+
<thead>
|
| 203 |
+
<tr><th>Word</th><th>Similarity</th></tr>
|
| 204 |
+
</thead>
|
| 205 |
+
<tbody>
|
| 206 |
+
{simResults.map((r, i) => (
|
| 207 |
+
<tr key={i}>
|
| 208 |
+
<td style={{ fontWeight: 600 }}>{r.word}</td>
|
| 209 |
+
<td><ScoreBar score={r.score} /></td>
|
| 210 |
+
</tr>
|
| 211 |
+
))}
|
| 212 |
+
</tbody>
|
| 213 |
+
</table>
|
| 214 |
+
)}
|
| 215 |
+
</div>
|
| 216 |
+
|
| 217 |
+
{/* 3. Compare Texts */}
|
| 218 |
+
<div className="panel">
|
| 219 |
+
<h2>3. Compare Texts</h2>
|
| 220 |
+
<p className="panel-desc">
|
| 221 |
+
Sentence similarity via averaged word vectors.
|
| 222 |
+
</p>
|
| 223 |
+
<div className="form-row">
|
| 224 |
+
<div className="form-group">
|
| 225 |
+
<label>Text A</label>
|
| 226 |
+
<input value={compTextA} onChange={e => setCompTextA(e.target.value)}
|
| 227 |
+
placeholder="pizza gives me homework" />
|
| 228 |
+
</div>
|
| 229 |
+
<div className="form-group">
|
| 230 |
+
<label>Text B</label>
|
| 231 |
+
<input value={compTextB} onChange={e => setCompTextB(e.target.value)}
|
| 232 |
+
placeholder="school gives me homework" />
|
| 233 |
+
</div>
|
| 234 |
+
</div>
|
| 235 |
+
<button className="btn btn-primary" onClick={handleCompare}
|
| 236 |
+
disabled={compLoading || !ready || !compTextA.trim() || !compTextB.trim()} style={{ marginTop: 8 }}>
|
| 237 |
+
{compLoading ? "Comparing..." : "Compare"}
|
| 238 |
+
</button>
|
| 239 |
+
|
| 240 |
+
{compResult && (
|
| 241 |
+
<div className="similarity-gauge" style={{ marginTop: 16 }}>
|
| 242 |
+
<div className="similarity-value"
|
| 243 |
+
style={{ color: scoreColor(compResult.similarity) }}>
|
| 244 |
+
{compResult.similarity.toFixed(4)}
|
| 245 |
+
</div>
|
| 246 |
+
<div className="similarity-label">Word2Vec Cosine Similarity</div>
|
| 247 |
+
</div>
|
| 248 |
+
)}
|
| 249 |
+
</div>
|
| 250 |
+
|
| 251 |
+
{/* 4. Semantic Search */}
|
| 252 |
+
<div className="panel">
|
| 253 |
+
<h2>4. Semantic Search</h2>
|
| 254 |
+
<p className="panel-desc">
|
| 255 |
+
Search your corpus using averaged Word2Vec vectors.
|
| 256 |
+
</p>
|
| 257 |
+
<div className="form-row">
|
| 258 |
+
<div className="form-group">
|
| 259 |
+
<label>Query</label>
|
| 260 |
+
<input value={queryText} onChange={e => setQueryText(e.target.value)}
|
| 261 |
+
onKeyDown={e => e.key === "Enter" && handleQuery()}
|
| 262 |
+
placeholder="a place where children learn" />
|
| 263 |
+
</div>
|
| 264 |
+
<div className="form-group form-group-sm">
|
| 265 |
+
<label>Top K</label>
|
| 266 |
+
<input type="number" value={queryTopK} onChange={e => setQueryTopK(+e.target.value)} min={1} max={20} />
|
| 267 |
+
</div>
|
| 268 |
+
<div className="form-group form-group-sm">
|
| 269 |
+
<label> </label>
|
| 270 |
+
<button className="btn btn-primary" onClick={handleQuery}
|
| 271 |
+
disabled={queryLoading || !ready || !queryText.trim()}>
|
| 272 |
+
{queryLoading ? "Searching..." : "Search"}
|
| 273 |
+
</button>
|
| 274 |
+
</div>
|
| 275 |
+
</div>
|
| 276 |
+
|
| 277 |
+
{queryResults.length > 0 && (
|
| 278 |
+
<div style={{ marginTop: 12 }}>
|
| 279 |
+
{queryResults.map((r, i) => (
|
| 280 |
+
<div key={i} className="result-card">
|
| 281 |
+
<div className="result-header">
|
| 282 |
+
<span>#{r.rank} <span className="tag">{r.doc_id}</span></span>
|
| 283 |
+
<ScoreBar score={r.score} />
|
| 284 |
+
</div>
|
| 285 |
+
<div className="result-text">{r.text}</div>
|
| 286 |
+
</div>
|
| 287 |
+
))}
|
| 288 |
+
</div>
|
| 289 |
+
)}
|
| 290 |
+
</div>
|
| 291 |
+
</div>
|
| 292 |
+
);
|
| 293 |
+
}
|
frontend/src/hooks/useApiCall.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useCallback } from "react";
|
| 2 |
+
import { getErrorMessage } from "../api";
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Generic hook for API calls with loading/error/result state.
|
| 6 |
+
* Eliminates the repeated try/catch/setLoading/setError pattern.
|
| 7 |
+
*/
|
| 8 |
+
export function useApiCall<T>() {
|
| 9 |
+
const [data, setData] = useState<T | null>(null);
|
| 10 |
+
const [loading, setLoading] = useState(false);
|
| 11 |
+
const [error, setError] = useState("");
|
| 12 |
+
|
| 13 |
+
const run = useCallback(async (fn: () => Promise<T>): Promise<T | null> => {
|
| 14 |
+
setLoading(true);
|
| 15 |
+
setError("");
|
| 16 |
+
try {
|
| 17 |
+
const result = await fn();
|
| 18 |
+
setData(result);
|
| 19 |
+
return result;
|
| 20 |
+
} catch (err) {
|
| 21 |
+
setError(getErrorMessage(err));
|
| 22 |
+
return null;
|
| 23 |
+
} finally {
|
| 24 |
+
setLoading(false);
|
| 25 |
+
}
|
| 26 |
+
}, []);
|
| 27 |
+
|
| 28 |
+
const clear = useCallback(() => {
|
| 29 |
+
setData(null);
|
| 30 |
+
setError("");
|
| 31 |
+
}, []);
|
| 32 |
+
|
| 33 |
+
return { data, loading, error, setError, run, clear };
|
| 34 |
+
}
|
frontend/src/hooks/useCorpusLoader.ts
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { api, getErrorMessage } from "../api";
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Shared hook for loading corpus text from the engine and parsing it into documents.
|
| 6 |
+
* Used by both TrainingPanel and Word2VecPanel.
|
| 7 |
+
*/
|
| 8 |
+
export function useCorpusLoader() {
|
| 9 |
+
const [corpusText, setCorpusText] = useState("");
|
| 10 |
+
const [loading, setLoading] = useState(false);
|
| 11 |
+
const [error, setError] = useState("");
|
| 12 |
+
|
| 13 |
+
function parseCorpus(): string[] {
|
| 14 |
+
return corpusText
|
| 15 |
+
.split(/\n{2,}/)
|
| 16 |
+
.map((t) => t.trim())
|
| 17 |
+
.filter((t) => t.length > 20);
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
async function loadFromEngine() {
|
| 21 |
+
setLoading(true);
|
| 22 |
+
setError("");
|
| 23 |
+
try {
|
| 24 |
+
const res = await api.getCorpusTexts();
|
| 25 |
+
if (res.documents.length === 0) {
|
| 26 |
+
setError("No documents loaded in the engine. Load a dataset first.");
|
| 27 |
+
return;
|
| 28 |
+
}
|
| 29 |
+
setCorpusText(
|
| 30 |
+
res.documents.map((d: { doc_id: string; text: string }) => d.text).join("\n\n")
|
| 31 |
+
);
|
| 32 |
+
} catch (e) {
|
| 33 |
+
setError(getErrorMessage(e));
|
| 34 |
+
} finally {
|
| 35 |
+
setLoading(false);
|
| 36 |
+
}
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
return {
|
| 40 |
+
corpusText,
|
| 41 |
+
setCorpusText,
|
| 42 |
+
loading,
|
| 43 |
+
error,
|
| 44 |
+
setError,
|
| 45 |
+
parseCorpus,
|
| 46 |
+
loadFromEngine,
|
| 47 |
+
};
|
| 48 |
+
}
|
frontend/src/main.tsx
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { StrictMode } from "react";
|
| 2 |
+
import { createRoot } from "react-dom/client";
|
| 3 |
+
import App from "./App";
|
| 4 |
+
|
| 5 |
+
createRoot(document.getElementById("root")!).render(
|
| 6 |
+
<StrictMode>
|
| 7 |
+
<App />
|
| 8 |
+
</StrictMode>
|
| 9 |
+
);
|
frontend/src/styles.css
ADDED
|
@@ -0,0 +1,828 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
 * Global stylesheet for the dark-theme frontend.
 * Organized by component; colors and sizing come from the :root tokens below.
 */

/* ---- Reset & Base ---- */
*,
*::before,
*::after {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Design tokens — referenced via var() throughout the file. */
:root {
  --bg: #0f1117;
  --surface: #1a1d27;
  --surface2: #232733;
  --border: #2e3340;
  --text: #e1e4eb;
  --text-dim: #8b90a0;
  --accent: #6c8cff;
  --accent-dim: #4a64cc;
  --ok: #4ade80;
  --warn: #facc15;
  --err: #f87171;
  --radius: 8px;
}

body {
  font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
  background: var(--bg);
  color: var(--text);
  line-height: 1.6;
}

/* ---- App Layout ---- */
.app {
  max-width: 1200px;
  margin: 0 auto;
  padding: 24px;
}

.app-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  flex-wrap: wrap;
  gap: 12px;
  margin-bottom: 24px;
}

.app-header h1 {
  font-size: 1.5rem;
  font-weight: 700;
  color: var(--accent);
}

.header-stats {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

/* ---- Badges ---- */
.badge {
  padding: 4px 10px;
  border-radius: 12px;
  font-size: 0.75rem;
  font-weight: 600;
  background: var(--surface2);
  color: var(--text-dim);
}
.badge-ok {
  background: #1a3a2a;
  color: var(--ok);
}
.badge-warn {
  background: #3a3520;
  color: var(--warn);
}

/* ---- Progress Stepper ---- */
.stepper {
  display: flex;
  align-items: flex-start;
  justify-content: center;
  margin-bottom: 28px;
  padding: 0 24px;
}

.stepper-item {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 6px;
  position: relative;
  z-index: 1;
}

/* Connector line between step circles; sits above nothing (items are z-index 1). */
.stepper-line {
  flex: 1;
  height: 2px;
  background: var(--border);
  margin-top: 16px;
  min-width: 40px;
}

.stepper-line-active {
  background: var(--accent-dim);
}

.stepper-circle {
  width: 34px;
  height: 34px;
  border-radius: 50%;
  border: 2px solid var(--border);
  background: var(--surface);
  color: var(--text-dim);
  font-weight: 700;
  font-size: 0.85rem;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  transition: all 0.2s;
}

.stepper-circle:hover:not(:disabled) {
  border-color: var(--accent);
  color: var(--accent);
}

.stepper-circle.stepper-active {
  border-color: var(--accent);
  background: var(--accent);
  color: #fff;
}

.stepper-circle.stepper-done {
  border-color: var(--ok);
  background: #1a3a2a;
  color: var(--ok);
}

.stepper-circle:disabled {
  opacity: 0.35;
  cursor: not-allowed;
}

.stepper-label {
  font-size: 0.75rem;
  color: var(--text-dim);
  white-space: nowrap;
  font-weight: 500;
}

.stepper-label-active {
  color: var(--accent);
  font-weight: 600;
}

/* ---- Sub-tabs ---- */
.subtabs {
  display: flex;
  gap: 2px;
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 3px;
  margin-bottom: 20px;
  overflow-x: auto;
}

.subtab {
  padding: 7px 16px;
  background: none;
  border: none;
  border-radius: 6px;
  color: var(--text-dim);
  cursor: pointer;
  font-size: 0.8rem;
  font-weight: 500;
  white-space: nowrap;
  transition: all 0.15s;
}

.subtab:hover {
  color: var(--text);
  background: var(--surface2);
}

.subtab-active {
  color: #fff;
  background: var(--accent);
  font-weight: 600;
}

/* ---- Collapsible Toggle ---- */
.collapsible-toggle {
  display: flex;
  align-items: center;
  gap: 8px;
  width: 100%;
  padding: 14px 16px;
  margin: 16px 0;
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  color: var(--text-dim);
  font-size: 0.85rem;
  font-weight: 500;
  cursor: pointer;
  transition: color 0.15s, border-color 0.15s;
}

.collapsible-toggle:hover {
  color: var(--text);
  border-color: var(--accent-dim);
}

.collapsible-arrow {
  font-size: 0.75rem;
}

/* ---- Advanced Settings Toggle ---- */
.advanced-toggle {
  display: flex;
  align-items: center;
  gap: 6px;
  padding: 0;
  margin: 12px 0 0;
  background: none;
  border: none;
  color: var(--text-dim);
  font-size: 0.8rem;
  font-weight: 500;
  cursor: pointer;
  transition: color 0.15s;
}

.advanced-toggle:hover {
  color: var(--accent);
}

.advanced-section {
  padding-top: 8px;
  margin-bottom: 12px;
}

/* ---- Toggle (segmented control) ---- */
.toggle {
  display: inline-flex;
  gap: 2px;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 3px;
}

.toggle-option {
  padding: 6px 14px;
  background: none;
  border: none;
  border-radius: 6px;
  color: var(--text-dim);
  font-size: 0.8rem;
  font-weight: 500;
  cursor: pointer;
  transition: all 0.15s;
  white-space: nowrap;
}

.toggle-option:hover {
  color: var(--text);
}

.toggle-option-active {
  background: var(--accent);
  color: #fff;
  font-weight: 600;
}

/* ---- Switch (on/off) ---- */
.switch {
  display: inline-flex;
  align-items: center;
  gap: 8px;
  cursor: pointer;
}

.switch-track {
  position: relative;
  width: 40px;
  height: 22px;
  border-radius: 11px;
  background: var(--border);
  border: none;
  cursor: pointer;
  padding: 0;
  transition: background 0.2s;
}

.switch-track-on {
  background: var(--accent);
}

.switch-thumb {
  position: absolute;
  top: 2px;
  left: 2px;
  width: 18px;
  height: 18px;
  border-radius: 50%;
  background: #fff;
  transition: transform 0.2s;
}

/* Thumb travel: 40px track − 18px thumb − 2×2px inset = 18px. */
.switch-track-on .switch-thumb {
  transform: translateX(18px);
}

.switch-label {
  font-size: 0.8rem;
  color: var(--text-dim);
  font-weight: 500;
  user-select: none;
}

/* ---- Custom Select ---- */
.custom-select {
  position: relative;
  min-width: 180px;
}

.custom-select-trigger {
  display: flex;
  align-items: center;
  justify-content: space-between;
  width: 100%;
  padding: 8px 12px;
  background: var(--bg);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  color: var(--text);
  font-size: 0.875rem;
  font-family: inherit;
  cursor: pointer;
  transition: border-color 0.15s;
  text-align: left;
}

.custom-select-trigger:hover,
.custom-select-trigger:focus {
  border-color: var(--accent);
  outline: none;
}

.custom-select-arrow {
  font-size: 0.7rem;
  color: var(--text-dim);
  margin-left: 8px;
}

/* Dropdown floats over surrounding content (z-index 100). */
.custom-select-dropdown {
  position: absolute;
  top: calc(100% + 4px);
  left: 0;
  right: 0;
  background: var(--surface);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  padding: 4px;
  z-index: 100;
  max-height: 240px;
  overflow-y: auto;
  box-shadow: 0 8px 24px rgba(0, 0, 0, 0.4);
}

.custom-select-option {
  display: block;
  width: 100%;
  padding: 8px 10px;
  background: none;
  border: none;
  border-radius: 6px;
  color: var(--text-dim);
  font-size: 0.85rem;
  font-family: inherit;
  cursor: pointer;
  text-align: left;
  transition: all 0.1s;
}

.custom-select-option:hover {
  background: var(--surface2);
  color: var(--text);
}

.custom-select-option-active {
  background: var(--accent);
  color: #fff;
}

.custom-select-option-active:hover {
  background: var(--accent-dim);
  color: #fff;
}
|
| 404 |
+
|
| 405 |
+
/* ---- Server Error Banner ---- */
|
| 406 |
+
.server-error-banner {
|
| 407 |
+
background: #3a1a1a;
|
| 408 |
+
color: var(--err);
|
| 409 |
+
border: 1px solid #5a2a2a;
|
| 410 |
+
border-radius: var(--radius);
|
| 411 |
+
padding: 12px 16px;
|
| 412 |
+
margin-bottom: 20px;
|
| 413 |
+
font-size: 0.85rem;
|
| 414 |
+
line-height: 1.5;
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
/* ---- Content ---- */
|
| 418 |
+
.content {
|
| 419 |
+
min-height: 400px;
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
/* ---- Cards / Panels ---- */
|
| 423 |
+
.panel {
|
| 424 |
+
background: var(--surface);
|
| 425 |
+
border: 1px solid var(--border);
|
| 426 |
+
border-radius: var(--radius);
|
| 427 |
+
padding: 20px;
|
| 428 |
+
margin-bottom: 16px;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
.panel h2 {
|
| 432 |
+
font-size: 1.1rem;
|
| 433 |
+
font-weight: 600;
|
| 434 |
+
margin-bottom: 12px;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
.panel h3 {
|
| 438 |
+
font-size: 0.95rem;
|
| 439 |
+
font-weight: 600;
|
| 440 |
+
margin-bottom: 8px;
|
| 441 |
+
color: var(--text-dim);
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
/* ---- Forms ---- */
|
| 445 |
+
.form-row {
|
| 446 |
+
display: flex;
|
| 447 |
+
gap: 12px;
|
| 448 |
+
margin-bottom: 12px;
|
| 449 |
+
flex-wrap: wrap;
|
| 450 |
+
align-items: flex-end;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.form-group {
|
| 454 |
+
display: flex;
|
| 455 |
+
flex-direction: column;
|
| 456 |
+
gap: 4px;
|
| 457 |
+
flex: 1;
|
| 458 |
+
min-width: 180px;
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
.form-group label {
|
| 462 |
+
font-size: 0.8rem;
|
| 463 |
+
font-weight: 500;
|
| 464 |
+
color: var(--text-dim);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
input,
|
| 468 |
+
textarea,
|
| 469 |
+
select {
|
| 470 |
+
padding: 8px 12px;
|
| 471 |
+
background: var(--bg);
|
| 472 |
+
border: 1px solid var(--border);
|
| 473 |
+
border-radius: var(--radius);
|
| 474 |
+
color: var(--text);
|
| 475 |
+
font-size: 0.875rem;
|
| 476 |
+
font-family: inherit;
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
input:focus,
|
| 480 |
+
textarea:focus,
|
| 481 |
+
select:focus {
|
| 482 |
+
outline: none;
|
| 483 |
+
border-color: var(--accent);
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
textarea {
|
| 487 |
+
resize: vertical;
|
| 488 |
+
min-height: 100px;
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
/* ---- Buttons ---- */
|
| 492 |
+
button.btn {
|
| 493 |
+
padding: 8px 20px;
|
| 494 |
+
border: none;
|
| 495 |
+
border-radius: var(--radius);
|
| 496 |
+
font-size: 0.875rem;
|
| 497 |
+
font-weight: 600;
|
| 498 |
+
cursor: pointer;
|
| 499 |
+
transition: background 0.15s, opacity 0.15s;
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
.btn-primary {
|
| 503 |
+
background: var(--accent);
|
| 504 |
+
color: #fff;
|
| 505 |
+
}
|
| 506 |
+
.btn-primary:hover:not(:disabled) {
|
| 507 |
+
background: var(--accent-dim);
|
| 508 |
+
}
|
| 509 |
+
.btn-secondary {
|
| 510 |
+
background: var(--surface2);
|
| 511 |
+
color: var(--text);
|
| 512 |
+
}
|
| 513 |
+
.btn-secondary:hover:not(:disabled) {
|
| 514 |
+
background: var(--border);
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
button:disabled {
|
| 518 |
+
opacity: 0.5;
|
| 519 |
+
cursor: not-allowed;
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
/* ---- Results ---- */
|
| 523 |
+
.result-card {
|
| 524 |
+
background: var(--surface2);
|
| 525 |
+
border: 1px solid var(--border);
|
| 526 |
+
border-radius: var(--radius);
|
| 527 |
+
padding: 16px;
|
| 528 |
+
margin-bottom: 10px;
|
| 529 |
+
transition: border-color 0.15s;
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
.result-card:hover {
|
| 533 |
+
border-color: var(--accent-dim);
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
.result-card .result-header {
|
| 537 |
+
display: flex;
|
| 538 |
+
justify-content: space-between;
|
| 539 |
+
align-items: center;
|
| 540 |
+
margin-bottom: 8px;
|
| 541 |
+
gap: 8px;
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
.result-card-selected {
|
| 545 |
+
border-color: var(--accent);
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
.result-card .result-text {
|
| 549 |
+
font-size: 0.85rem;
|
| 550 |
+
color: var(--text-dim);
|
| 551 |
+
line-height: 1.6;
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
.score-bar-container {
|
| 555 |
+
display: flex;
|
| 556 |
+
align-items: center;
|
| 557 |
+
gap: 8px;
|
| 558 |
+
}
|
| 559 |
+
|
| 560 |
+
.score-bar {
|
| 561 |
+
width: 120px;
|
| 562 |
+
height: 6px;
|
| 563 |
+
background: var(--bg);
|
| 564 |
+
border-radius: 3px;
|
| 565 |
+
overflow: hidden;
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
.score-bar-fill {
|
| 569 |
+
height: 100%;
|
| 570 |
+
border-radius: 3px;
|
| 571 |
+
transition: width 0.3s;
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
.score-label {
|
| 575 |
+
font-size: 0.8rem;
|
| 576 |
+
font-weight: 700;
|
| 577 |
+
font-variant-numeric: tabular-nums;
|
| 578 |
+
min-width: 48px;
|
| 579 |
+
text-align: right;
|
| 580 |
+
}
|
| 581 |
+
|
| 582 |
+
/* ---- Similarity gauge ---- */
|
| 583 |
+
.similarity-gauge {
|
| 584 |
+
display: flex;
|
| 585 |
+
align-items: center;
|
| 586 |
+
justify-content: center;
|
| 587 |
+
flex-direction: column;
|
| 588 |
+
padding: 24px;
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
.similarity-value {
|
| 592 |
+
font-size: 3rem;
|
| 593 |
+
font-weight: 800;
|
| 594 |
+
font-variant-numeric: tabular-nums;
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
.similarity-label {
|
| 598 |
+
font-size: 0.9rem;
|
| 599 |
+
color: var(--text-dim);
|
| 600 |
+
margin-top: 4px;
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
/* ---- Status / Alerts ---- */
|
| 604 |
+
.status {
|
| 605 |
+
padding: 10px 14px;
|
| 606 |
+
border-radius: var(--radius);
|
| 607 |
+
font-size: 0.85rem;
|
| 608 |
+
margin-bottom: 12px;
|
| 609 |
+
}
|
| 610 |
+
.status-ok {
|
| 611 |
+
background: #1a3a2a;
|
| 612 |
+
color: var(--ok);
|
| 613 |
+
}
|
| 614 |
+
.status-err {
|
| 615 |
+
background: #3a1a1a;
|
| 616 |
+
color: var(--err);
|
| 617 |
+
}
|
| 618 |
+
.status-loading {
|
| 619 |
+
background: var(--surface2);
|
| 620 |
+
color: var(--text-dim);
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
/* ---- Table ---- */
|
| 624 |
+
.data-table {
|
| 625 |
+
width: 100%;
|
| 626 |
+
border-collapse: collapse;
|
| 627 |
+
font-size: 0.85rem;
|
| 628 |
+
}
|
| 629 |
+
.data-table th,
|
| 630 |
+
.data-table td {
|
| 631 |
+
padding: 8px 12px;
|
| 632 |
+
text-align: left;
|
| 633 |
+
border-bottom: 1px solid var(--border);
|
| 634 |
+
}
|
| 635 |
+
.data-table th {
|
| 636 |
+
color: var(--text-dim);
|
| 637 |
+
font-weight: 600;
|
| 638 |
+
font-size: 0.8rem;
|
| 639 |
+
text-transform: uppercase;
|
| 640 |
+
letter-spacing: 0.5px;
|
| 641 |
+
}
|
| 642 |
+
.data-table tr:hover td {
|
| 643 |
+
background: var(--surface2);
|
| 644 |
+
}
|
| 645 |
+
.data-table input,
|
| 646 |
+
.data-table select {
|
| 647 |
+
font-size: 0.85rem;
|
| 648 |
+
}
|
| 649 |
+
|
| 650 |
+
/* ---- Grid ---- */
|
| 651 |
+
.grid-2 {
|
| 652 |
+
display: grid;
|
| 653 |
+
grid-template-columns: 1fr 1fr;
|
| 654 |
+
gap: 16px;
|
| 655 |
+
}
|
| 656 |
+
|
| 657 |
+
@media (max-width: 768px) {
|
| 658 |
+
.grid-2 {
|
| 659 |
+
grid-template-columns: 1fr;
|
| 660 |
+
}
|
| 661 |
+
}
|
| 662 |
+
|
| 663 |
+
/* ---- Metric Card ---- */
|
| 664 |
+
.metric-card {
|
| 665 |
+
background: var(--surface2);
|
| 666 |
+
border: 1px solid var(--border);
|
| 667 |
+
border-radius: var(--radius);
|
| 668 |
+
padding: 16px;
|
| 669 |
+
text-align: center;
|
| 670 |
+
}
|
| 671 |
+
|
| 672 |
+
.metric-value {
|
| 673 |
+
font-size: 1.75rem;
|
| 674 |
+
font-weight: 700;
|
| 675 |
+
color: var(--text);
|
| 676 |
+
font-variant-numeric: tabular-nums;
|
| 677 |
+
}
|
| 678 |
+
|
| 679 |
+
.metric-label {
|
| 680 |
+
font-size: 0.78rem;
|
| 681 |
+
color: var(--text-dim);
|
| 682 |
+
margin-top: 4px;
|
| 683 |
+
text-transform: uppercase;
|
| 684 |
+
letter-spacing: 0.3px;
|
| 685 |
+
}
|
| 686 |
+
|
| 687 |
+
/* ---- Spinner ---- */
|
| 688 |
+
.spinner {
|
| 689 |
+
display: inline-block;
|
| 690 |
+
width: 16px;
|
| 691 |
+
height: 16px;
|
| 692 |
+
border: 2px solid var(--text-dim);
|
| 693 |
+
border-top-color: var(--accent);
|
| 694 |
+
border-radius: 50%;
|
| 695 |
+
animation: spin 0.6s linear infinite;
|
| 696 |
+
margin-right: 6px;
|
| 697 |
+
vertical-align: middle;
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
@keyframes spin {
|
| 701 |
+
to {
|
| 702 |
+
transform: rotate(360deg);
|
| 703 |
+
}
|
| 704 |
+
}
|
| 705 |
+
|
| 706 |
+
/* ---- Tags ---- */
|
| 707 |
+
.tag {
|
| 708 |
+
display: inline-block;
|
| 709 |
+
padding: 2px 8px;
|
| 710 |
+
border-radius: 4px;
|
| 711 |
+
font-size: 0.75rem;
|
| 712 |
+
font-weight: 600;
|
| 713 |
+
background: var(--surface);
|
| 714 |
+
margin: 2px;
|
| 715 |
+
}
|
| 716 |
+
|
| 717 |
+
.tag-best {
|
| 718 |
+
background: #1a3a2a;
|
| 719 |
+
color: var(--ok);
|
| 720 |
+
}
|
| 721 |
+
|
| 722 |
+
/* ---- Utility classes ---- */
|
| 723 |
+
.panel-desc {
|
| 724 |
+
color: var(--text-dim);
|
| 725 |
+
font-size: 0.85rem;
|
| 726 |
+
margin-bottom: 12px;
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
.section-label {
|
| 730 |
+
font-size: 0.8rem;
|
| 731 |
+
font-weight: 600;
|
| 732 |
+
color: var(--text-dim);
|
| 733 |
+
margin-bottom: 4px;
|
| 734 |
+
}
|
| 735 |
+
|
| 736 |
+
.text-dim { color: var(--text-dim); }
|
| 737 |
+
|
| 738 |
+
.form-group-sm { max-width: 100px; }
|
| 739 |
+
.form-group-md { max-width: 140px; }
|
| 740 |
+
.form-group-lg { max-width: 220px; }
|
| 741 |
+
|
| 742 |
+
.metric-grid {
|
| 743 |
+
display: flex;
|
| 744 |
+
gap: 16px;
|
| 745 |
+
flex-wrap: wrap;
|
| 746 |
+
}
|
| 747 |
+
.metric-grid > * {
|
| 748 |
+
flex: 1 1 100px;
|
| 749 |
+
}
|
| 750 |
+
|
| 751 |
+
.flex-row { display: flex; gap: 8px; }
|
| 752 |
+
.flex-col { display: flex; flex-direction: column; }
|
| 753 |
+
.flex-wrap { flex-wrap: wrap; }
|
| 754 |
+
|
| 755 |
+
.gap-1 { gap: 8px; }
|
| 756 |
+
.gap-2 { gap: 12px; }
|
| 757 |
+
.gap-3 { gap: 16px; }
|
| 758 |
+
|
| 759 |
+
.mt-1 { margin-top: 8px; }
|
| 760 |
+
.mt-2 { margin-top: 12px; }
|
| 761 |
+
.mt-3 { margin-top: 16px; }
|
| 762 |
+
.mb-1 { margin-bottom: 8px; }
|
| 763 |
+
.mb-2 { margin-bottom: 12px; }
|
| 764 |
+
.mb-3 { margin-bottom: 16px; }
|
| 765 |
+
|
| 766 |
+
/* ---- Context Analysis bar chart ---- */
|
| 767 |
+
.context-bar-row {
|
| 768 |
+
display: flex;
|
| 769 |
+
align-items: center;
|
| 770 |
+
gap: 10px;
|
| 771 |
+
margin-bottom: 6px;
|
| 772 |
+
}
|
| 773 |
+
|
| 774 |
+
.context-bar-label {
|
| 775 |
+
width: 90px;
|
| 776 |
+
font-size: 0.82rem;
|
| 777 |
+
font-weight: 600;
|
| 778 |
+
text-align: right;
|
| 779 |
+
color: var(--text);
|
| 780 |
+
flex-shrink: 0;
|
| 781 |
+
}
|
| 782 |
+
|
| 783 |
+
.context-bar-track {
|
| 784 |
+
flex: 1;
|
| 785 |
+
height: 8px;
|
| 786 |
+
background: var(--bg);
|
| 787 |
+
border-radius: 4px;
|
| 788 |
+
overflow: hidden;
|
| 789 |
+
}
|
| 790 |
+
|
| 791 |
+
.context-bar-fill {
|
| 792 |
+
height: 100%;
|
| 793 |
+
background: var(--accent);
|
| 794 |
+
border-radius: 4px;
|
| 795 |
+
transition: width 0.3s;
|
| 796 |
+
}
|
| 797 |
+
|
| 798 |
+
.context-bar-value {
|
| 799 |
+
font-size: 0.75rem;
|
| 800 |
+
color: var(--text-dim);
|
| 801 |
+
width: 40px;
|
| 802 |
+
text-align: right;
|
| 803 |
+
flex-shrink: 0;
|
| 804 |
+
}
|
| 805 |
+
|
| 806 |
+
.context-snippet {
|
| 807 |
+
font-size: 0.8rem;
|
| 808 |
+
color: var(--text-dim);
|
| 809 |
+
line-height: 1.5;
|
| 810 |
+
padding: 8px 10px;
|
| 811 |
+
background: var(--bg);
|
| 812 |
+
border-radius: 6px;
|
| 813 |
+
margin-bottom: 4px;
|
| 814 |
+
}
|
| 815 |
+
|
| 816 |
+
.context-snippet-source {
|
| 817 |
+
font-size: 0.7rem;
|
| 818 |
+
color: var(--accent);
|
| 819 |
+
margin-right: 6px;
|
| 820 |
+
}
|
| 821 |
+
|
| 822 |
+
.score-bar-legend {
|
| 823 |
+
display: flex;
|
| 824 |
+
justify-content: space-between;
|
| 825 |
+
font-size: 0.75rem;
|
| 826 |
+
color: var(--text-dim);
|
| 827 |
+
margin-top: 4px;
|
| 828 |
+
}
|
frontend/src/types.ts
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ---- API Request types ----
|
| 2 |
+
|
| 3 |
+
export interface InitRequest {
|
| 4 |
+
model_name: string;
|
| 5 |
+
chunk_size: number;
|
| 6 |
+
chunk_overlap: number;
|
| 7 |
+
batch_size: number;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
export interface DocumentRequest {
|
| 11 |
+
doc_id: string;
|
| 12 |
+
text: string;
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
export interface QueryRequest {
|
| 16 |
+
text: string;
|
| 17 |
+
top_k: number;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
export interface CompareRequest {
|
| 21 |
+
text_a: string;
|
| 22 |
+
text_b: string;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
export interface KeywordAnalysisRequest {
|
| 26 |
+
keyword: string;
|
| 27 |
+
top_k: number;
|
| 28 |
+
cluster_threshold: number;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
export interface KeywordMatchRequest {
|
| 32 |
+
keyword: string;
|
| 33 |
+
candidate_meanings: string[];
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
export interface BatchAnalysisRequest {
|
| 37 |
+
keywords: string[];
|
| 38 |
+
top_k: number;
|
| 39 |
+
cluster_threshold: number;
|
| 40 |
+
compare_across: boolean;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
// ---- API Response types ----
|
| 44 |
+
|
| 45 |
+
export interface ChunkPreview {
|
| 46 |
+
index: number;
|
| 47 |
+
text: string;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
export interface InitResponse {
|
| 51 |
+
status: string;
|
| 52 |
+
model: string;
|
| 53 |
+
load_time_seconds: number;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
export interface AddDocResponse {
|
| 57 |
+
status: string;
|
| 58 |
+
doc_id: string;
|
| 59 |
+
num_chunks: number;
|
| 60 |
+
chunks_preview: ChunkPreview[];
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
export interface BuildIndexResponse {
|
| 64 |
+
status: string;
|
| 65 |
+
total_chunks: number;
|
| 66 |
+
embedding_dim: number;
|
| 67 |
+
build_time_seconds: number;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
export interface QueryResultItem {
|
| 71 |
+
rank: number;
|
| 72 |
+
score: number;
|
| 73 |
+
doc_id: string;
|
| 74 |
+
chunk_index: number;
|
| 75 |
+
text: string;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
export interface QueryResponse {
|
| 79 |
+
query: string;
|
| 80 |
+
results: QueryResultItem[];
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
export interface CompareResponse {
|
| 84 |
+
text_a: string;
|
| 85 |
+
text_b: string;
|
| 86 |
+
similarity: number;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
export interface ClusterContext {
|
| 90 |
+
doc_id: string;
|
| 91 |
+
chunk_index: number;
|
| 92 |
+
text: string;
|
| 93 |
+
highlight_positions: [number, number][];
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
export interface SimilarPassage {
|
| 97 |
+
rank: number;
|
| 98 |
+
score: number;
|
| 99 |
+
doc_id: string;
|
| 100 |
+
text: string;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
export interface MeaningCluster {
|
| 104 |
+
cluster_id: number;
|
| 105 |
+
size: number;
|
| 106 |
+
representative_text: string;
|
| 107 |
+
contexts: ClusterContext[];
|
| 108 |
+
similar_passages: SimilarPassage[];
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
export interface KeywordAnalysisResponse {
|
| 112 |
+
keyword: string;
|
| 113 |
+
total_occurrences: number;
|
| 114 |
+
meaning_clusters: MeaningCluster[];
|
| 115 |
+
cross_keyword_similarities: Record<string, number>;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
export interface MatchResult {
|
| 119 |
+
doc_id: string;
|
| 120 |
+
chunk_index: number;
|
| 121 |
+
text: string;
|
| 122 |
+
best_match: string;
|
| 123 |
+
best_score: number;
|
| 124 |
+
all_scores: Record<string, number>;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
export interface MatchResponse {
|
| 128 |
+
keyword: string;
|
| 129 |
+
candidate_meanings: string[];
|
| 130 |
+
matches: MatchResult[];
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
export interface CorpusStats {
|
| 134 |
+
total_chunks: number;
|
| 135 |
+
total_documents: number;
|
| 136 |
+
document_ids: string[];
|
| 137 |
+
index_built: boolean;
|
| 138 |
+
embedding_dim: number;
|
| 139 |
+
model_name: string;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
export interface SimilarityDistribution {
|
| 143 |
+
sample_size: number;
|
| 144 |
+
mean: number;
|
| 145 |
+
std: number;
|
| 146 |
+
min: number;
|
| 147 |
+
max: number;
|
| 148 |
+
percentiles: Record<string, number>;
|
| 149 |
+
histogram: { bin_start: number; bin_end: number; count: number }[];
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
export interface DisambiguationMetric {
|
| 153 |
+
keyword: string;
|
| 154 |
+
accuracy: number;
|
| 155 |
+
weighted_f1: number;
|
| 156 |
+
per_meaning_precision: Record<string, number>;
|
| 157 |
+
per_meaning_recall: Record<string, number>;
|
| 158 |
+
per_meaning_f1: Record<string, number>;
|
| 159 |
+
confusion_matrix: number[][];
|
| 160 |
+
total_samples: number;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
export interface RetrievalMetric {
|
| 164 |
+
query: string;
|
| 165 |
+
mrr: number;
|
| 166 |
+
precision_at_k: Record<string, number>;
|
| 167 |
+
recall_at_k: Record<string, number>;
|
| 168 |
+
ndcg_at_k: Record<string, number>;
|
| 169 |
+
avg_similarity: number;
|
| 170 |
+
top_score: number;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
// ---- Training types ----
|
| 174 |
+
|
| 175 |
+
export interface TrainResponse {
|
| 176 |
+
strategy: string;
|
| 177 |
+
model_path: string;
|
| 178 |
+
training_pairs: number;
|
| 179 |
+
epochs: number;
|
| 180 |
+
seconds: number;
|
| 181 |
+
keywords?: string[];
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
export interface TrainEvalResponse {
|
| 185 |
+
pairs: {
|
| 186 |
+
text_a: string;
|
| 187 |
+
text_b: string;
|
| 188 |
+
expected: number;
|
| 189 |
+
base_score: number;
|
| 190 |
+
trained_score: number;
|
| 191 |
+
base_error: number;
|
| 192 |
+
trained_error: number;
|
| 193 |
+
}[];
|
| 194 |
+
summary: {
|
| 195 |
+
avg_base_error: number;
|
| 196 |
+
avg_trained_error: number;
|
| 197 |
+
error_reduction_pct: number;
|
| 198 |
+
improved: number;
|
| 199 |
+
degraded: number;
|
| 200 |
+
total: number;
|
| 201 |
+
};
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
// ---- Word2Vec types ----
|
| 205 |
+
|
| 206 |
+
export interface W2VInitResponse {
|
| 207 |
+
vocab_size: number;
|
| 208 |
+
sentences: number;
|
| 209 |
+
vector_size: number;
|
| 210 |
+
seconds: number;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
export interface W2VQueryResult {
|
| 214 |
+
rank: number;
|
| 215 |
+
score: number;
|
| 216 |
+
doc_id: string;
|
| 217 |
+
text: string;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
export interface W2VSimilarWord {
|
| 221 |
+
word: string;
|
| 222 |
+
score: number;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
// ---- Dataset types ----
|
| 226 |
+
|
| 227 |
+
export interface DatasetSourceInfo {
|
| 228 |
+
dataset_id: string;
|
| 229 |
+
url: string;
|
| 230 |
+
description: string;
|
| 231 |
+
columns?: string[];
|
| 232 |
+
size_mb?: number;
|
| 233 |
+
model?: string;
|
| 234 |
+
vector_dim?: number;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
export interface DatasetInfo {
|
| 238 |
+
raw_texts: DatasetSourceInfo;
|
| 239 |
+
embeddings: DatasetSourceInfo;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
export interface DatasetLoadRequest {
|
| 243 |
+
source: "raw" | "embeddings";
|
| 244 |
+
max_docs: number;
|
| 245 |
+
min_text_length: number;
|
| 246 |
+
source_filter?: string;
|
| 247 |
+
build_index: boolean;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
export interface DatasetLoadResponse {
|
| 251 |
+
documents_loaded?: number;
|
| 252 |
+
documents_skipped?: number;
|
| 253 |
+
documents_created?: number;
|
| 254 |
+
total_chunks?: number;
|
| 255 |
+
chunks_indexed?: number;
|
| 256 |
+
chromadb_vectors?: number;
|
| 257 |
+
index_built: boolean;
|
| 258 |
+
seconds: number;
|
| 259 |
+
source?: string;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
export interface DatasetPreviewDoc {
|
| 263 |
+
doc_id: string;
|
| 264 |
+
filename: string;
|
| 265 |
+
text_preview: string;
|
| 266 |
+
text_length: number;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
export interface DatasetPreviewResponse {
|
| 270 |
+
count: number;
|
| 271 |
+
documents: DatasetPreviewDoc[];
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
// ---- Context Analysis types ----
|
| 275 |
+
|
| 276 |
+
export interface ContextAssociatedWord {
|
| 277 |
+
word: string;
|
| 278 |
+
score: number;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
export interface ContextExample {
|
| 282 |
+
doc_id: string;
|
| 283 |
+
snippet: string;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
export interface ContextMeaning {
|
| 287 |
+
cluster_id: number;
|
| 288 |
+
occurrences: number;
|
| 289 |
+
confidence: number;
|
| 290 |
+
associated_words: ContextAssociatedWord[];
|
| 291 |
+
example_contexts: ContextExample[];
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
export interface ContextAnalysisResponse {
|
| 295 |
+
keyword: string;
|
| 296 |
+
total_occurrences: number;
|
| 297 |
+
meanings: ContextMeaning[];
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
// ---- UI State ----
|
| 301 |
+
|
| 302 |
+
export type EvalSection = "distribution" | "disambiguation" | "retrieval";
|
frontend/src/utils/colors.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** Map a 0–1 similarity/score to a CSS color variable. */
|
| 2 |
+
export function scoreColor(score: number): string {
|
| 3 |
+
if (score >= 0.7) return "var(--ok)";
|
| 4 |
+
if (score >= 0.4) return "var(--warn)";
|
| 5 |
+
return "var(--err)";
|
| 6 |
+
}
|
frontend/src/vite-env.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="vite/client" />
|
frontend/tsconfig.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2020",
|
| 4 |
+
"useDefineForClassFields": true,
|
| 5 |
+
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
| 6 |
+
"module": "ESNext",
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"moduleResolution": "bundler",
|
| 9 |
+
"allowImportingTsExtensions": true,
|
| 10 |
+
"isolatedModules": true,
|
| 11 |
+
"moduleDetection": "force",
|
| 12 |
+
"noEmit": true,
|
| 13 |
+
"jsx": "react-jsx",
|
| 14 |
+
"strict": true,
|
| 15 |
+
"noUnusedLocals": false,
|
| 16 |
+
"noUnusedParameters": false,
|
| 17 |
+
"noFallthroughCasesInSwitch": true,
|
| 18 |
+
"forceConsistentCasingInFileNames": true
|
| 19 |
+
},
|
| 20 |
+
"include": ["src"]
|
| 21 |
+
}
|
frontend/vite.config.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from "vite";
|
| 2 |
+
import react from "@vitejs/plugin-react";
|
| 3 |
+
|
| 4 |
+
export default defineConfig({
|
| 5 |
+
plugins: [react()],
|
| 6 |
+
server: {
|
| 7 |
+
proxy: {
|
| 8 |
+
"/api/logs/stream": {
|
| 9 |
+
target: "http://localhost:8000",
|
| 10 |
+
headers: { "Accept": "text/event-stream" },
|
| 11 |
+
},
|
| 12 |
+
"/api": "http://localhost:8000",
|
| 13 |
+
},
|
| 14 |
+
},
|
| 15 |
+
});
|
pyproject.toml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "esfiles-ndr"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "Contextual word similarity analysis using transformer embeddings and Word2Vec baseline"
|
| 5 |
+
requires-python = ">=3.11"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"sentence-transformers>=5.2.3",
|
| 8 |
+
"faiss-cpu>=1.13.2",
|
| 9 |
+
"torch>=2.10.0",
|
| 10 |
+
"numpy>=2.4.3",
|
| 11 |
+
"scikit-learn>=1.8.0",
|
| 12 |
+
"tqdm>=4.67.3",
|
| 13 |
+
"gensim>=4.4.0",
|
| 14 |
+
"fastapi>=0.135.1",
|
| 15 |
+
"uvicorn[standard]>=0.41.0",
|
| 16 |
+
"python-multipart>=0.0.22",
|
| 17 |
+
"accelerate>=1.13.0",
|
| 18 |
+
"datasets>=4.7.0",
|
| 19 |
+
"chromadb>=1.5.4",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.scripts]
|
| 23 |
+
serve = "server:main"
|
| 24 |
+
demo = "demo:main"
|
| 25 |
+
|
| 26 |
+
[tool.uv]
|
| 27 |
+
dev-dependencies = []
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence-transformers>=5.2.3
|
| 2 |
+
faiss-cpu>=1.13.2
|
| 3 |
+
torch>=2.10.0
|
| 4 |
+
numpy>=2.4.3
|
| 5 |
+
scikit-learn>=1.8.0
|
| 6 |
+
tqdm>=4.67.3
|
| 7 |
+
gensim>=4.4.0
|
| 8 |
+
fastapi>=0.135.1
|
| 9 |
+
uvicorn[standard]>=0.41.0
|
| 10 |
+
python-multipart>=0.0.22
|
| 11 |
+
datasets>=4.7.0
|
| 12 |
+
chromadb>=1.5.4
|