github-actions commited on
Commit
5a3b322
·
1 Parent(s): c44ff26

Sync from GitHub 2025-12-17T12:18:53Z

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .dockerignore +27 -0
  3. .env.example +16 -0
  4. .gitattributes +0 -35
  5. .github/workflows/push_to_hf_space.yml +38 -0
  6. .gitignore +20 -0
  7. Dockerfile +23 -0
  8. Makefile +30 -0
  9. README.md +149 -12
  10. agent/app.py +76 -0
  11. agent/chat_agent.py +156 -0
  12. agent/router_agent.py +86 -0
  13. agent/server.py +324 -0
  14. api/__init__.py +1 -0
  15. config.py +75 -0
  16. configs/__init__.py +1 -0
  17. configs/config.yaml +43 -0
  18. configs/embedding_config.yaml +10 -0
  19. configs/retrieval.yaml +5 -0
  20. crawler/__init__.py +1 -0
  21. crawler/backfill_labels.py +72 -0
  22. crawler/export.py +94 -0
  23. crawler/fetcher.py +102 -0
  24. crawler/parser_catalog.py +143 -0
  25. crawler/parser_detail.py +320 -0
  26. crawler/qa_checks.py +74 -0
  27. crawler/robots.py +35 -0
  28. crawler/run.py +165 -0
  29. crawler/storage.py +209 -0
  30. crawler/utils.py +61 -0
  31. docker-compose.yml +26 -0
  32. embeddings/generator.py +68 -0
  33. eval/__init__.py +1 -0
  34. eval/compare_runs.py +34 -0
  35. eval/diagnostic_topk.py +88 -0
  36. eval/metrics.py +27 -0
  37. eval/run_eval.py +238 -0
  38. frontend/.dockerignore +9 -0
  39. frontend/Dockerfile +19 -0
  40. frontend/index.html +43 -0
  41. frontend/next-env.d.ts +5 -0
  42. frontend/next.config.mjs +8 -0
  43. frontend/out/404.html +1 -0
  44. frontend/out/_next/static/chunks/23-02b97631d99e6f05.js +0 -0
  45. frontend/out/_next/static/chunks/app/_not-found/page-a99a188ec9244b3f.js +1 -0
  46. frontend/out/_next/static/chunks/app/layout-fc95adeb217fd9c8.js +1 -0
  47. frontend/out/_next/static/chunks/app/page-73ea6ec0ec8fa438.js +16 -0
  48. frontend/out/_next/static/chunks/fd9d1056-0eb575322ff5015c.js +0 -0
  49. frontend/out/_next/static/chunks/framework-aec844d2ccbe7592.js +0 -0
  50. frontend/out/_next/static/chunks/main-app-df951a18dbec0e17.js +1 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.dockerignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .env
4
+ .env.*
5
+ .venv
6
+ venv
7
+ __pycache__
8
+ *.pyc
9
+ *.pyo
10
+ .pytest_cache
11
+ .mypy_cache
12
+ .ruff_cache
13
+ .model_cache
14
+ .cache
15
+ node_modules
16
+ frontend/node_modules
17
+ frontend/.next
18
+ frontend/.turbo
19
+ frontend/.vercel
20
+ runs
21
+ logs
22
+ *.log
23
+ *.tmp
24
+ *.swp
25
+ *.swo
26
+ *.orig
27
+ *.DS_Store
.env.example ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Application
2
+ APP_ENV=local
3
+ LOG_LEVEL=INFO
4
+ CONFIG_PATH=configs/config.yaml
5
+ USER_AGENT=llm-recommendation-engine/0.1 (+https://example.com)
6
+ START_URL=https://www.shl.com/products/product-catalog/
7
+ MAX_CONCURRENCY=2
8
+ REQUEST_DELAY_SECONDS=1.5
9
+ JITTER_SECONDS=0.5
10
+ MAX_RETRIES=3
11
+ ALLOW_ROBOTS_BYPASS=0
12
+
13
+ # External services / secrets
14
+ OPENAI_API_KEY=replace_me
15
+ VECTOR_DB_URL=replace_me
16
+ TRACING_ENDPOINT=replace_me
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/workflows/push_to_hf_space.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Push to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+
7
+ jobs:
8
+ sync-to-hf:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Checkout
12
+ uses: actions/checkout@v4
13
+ with:
14
+ lfs: true
15
+
16
+ - name: Push to HF Space
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ run: |
20
+ git config --global user.email "ci@github"
21
+ git config --global user.name "github-actions"
22
+
23
+ # Clone the Space repo
24
+ git clone https://AgamP:$HF_TOKEN@huggingface.co/spaces/AgamP/llm_recommendation_backend hf_space
25
+
26
+ # Replace Space contents with GitHub repo contents (except .git)
27
+
28
+ rsync -av --delete \
29
+ --exclude ".git" \
30
+ --exclude "hf_space" \
31
+ --exclude "*.pdf" \
32
+ ./ hf_space/
33
+
34
+
35
+ cd hf_space
36
+ git add -A
37
+ git commit -m "Sync from GitHub $(date -u +'%Y-%m-%dT%H:%M:%SZ')" || echo "No changes"
38
+ git push
.gitignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ .pytest_cache/
4
+ *.sqlite
5
+ *.db
6
+ logs/
7
+ data/crawler.db
8
+ data/catalog.parquet
9
+ data/catalog.jsonl
10
+ playwright-report/
11
+ playwright/.cache/
12
+ node_modules/
13
+ venv/
14
+ runs/
15
+ frontend/.next/
16
+ frontend/node_modules/
17
+ .model_cache/
18
+ data
19
+ data/
20
+ models/reranker_crossenc/v0.1.0/
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ # System deps for numpy/faiss/scipy style builds; drop if wheels suffice
10
+ RUN apt-get update && \
11
+ apt-get install -y --no-install-recommends build-essential && \
12
+ rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Copy source after deps to leverage Docker layer caching
18
+ COPY . .
19
+
20
+ EXPOSE 8000
21
+ # Render sets PORT; default to 8000 for local use
22
+ ENV PORT=8000
23
+ CMD ["sh", "-c", "uvicorn agent.server:app --host 0.0.0.0 --port ${PORT} --workers 2"]
Makefile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PYTHON ?= python3
2
+ VENV ?= .venv
3
+ ACTIVATE = . $(VENV)/bin/activate
4
+ APP_NAME ?= llm-recommender
5
+
6
+ .PHONY: setup install config-check lint test docker-build docker-run clean
7
+
8
+ setup:
9
+ $(PYTHON) -m venv $(VENV)
10
+
11
+ install: setup
12
+ $(ACTIVATE) && pip install --upgrade pip && pip install -r requirements.txt
13
+
14
+ config-check:
15
+ $(ACTIVATE) && PYTHONPATH=. $(PYTHON) config.py --print
16
+
17
+ lint:
18
+ @echo "Add linting tools (ruff/black/flake8) here"
19
+
20
+ test:
21
+ $(ACTIVATE) && PYTHONPATH=. pytest
22
+
23
+ docker-build:
24
+ docker build -t $(APP_NAME):dev .
25
+
26
+ docker-run:
27
+ docker run --rm -it -p 8000:8000 -p 3000:3000 --env-file .env.example $(APP_NAME):dev
28
+
29
+ clean:
30
+ rm -rf $(VENV) __pycache__ */__pycache__
README.md CHANGED
@@ -1,12 +1,149 @@
1
- ---
2
- title: Llm Recommendation Backend
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- short_description: 'fastapi backend for llm recomemndation engine for shl '
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm_recommendation_engine
2
+ Recommendation engine for SHL's product catalogue with conversational agents
3
+
4
+ ## Quick commands (crawler + export + QA)
5
+ - Install deps (and Playwright browser): `python -m pip install -r requirements.txt && python -m playwright install chromium`
6
+ - Clean DB: `rm -f data/crawler.db`
7
+ - Crawl (bypass robots if needed): `ALLOW_ROBOTS_BYPASS=1 python -m crawler.run --mode=crawl_all --max-discover=20`
8
+ - Drop `--max-discover` for full crawl.
9
+ - Export dataset: `python -m crawler.run --mode=export --limit-export=20`
10
+ - Outputs: `data/catalog.parquet`, `data/catalog.jsonl`
11
+ - Drop `--limit-export` for full export.
12
+ - QA checks: `python -m crawler.qa_checks data/catalog.jsonl > data/qa_summary.json`
13
+ - Summary JSON saved to `data/qa_summary.json`
14
+
15
+ ## What’s implemented
16
+ - Playwright-based crawler with catalog pagination, detail fetch, and structured storage in SQLite.
17
+ - Field extraction: url, name, description, test_type (+full), remote/adaptive flags, duration (minutes/hours), job_levels, languages, downloads.
18
+ - Export to Parquet/JSONL plus QA summary script for downstream sanity checks.
19
+
20
+ ## Evaluation harness (Phase 2)
21
+ - Catalog loader with canonical IDs: `python -m data.catalog_loader --input data/catalog.jsonl --output data/catalog_with_ids.jsonl`
22
+ - Train loader + label resolution report: `python -m data.train_loader --catalog data/catalog.jsonl --train <train_file> --report data/label_resolution_report.json`
23
+ - Run eval (dummy baseline): `python -m eval.run_eval --catalog data/catalog.jsonl --train <train_file> --recommender dummy_random`
24
+ - Run eval (BM25 baseline): `python -m eval.run_eval --catalog data/catalog.jsonl --train <train_file> --recommender bm25`
25
+ - Outputs run folder under `runs/<timestamp>_<recommender>/` with `metrics.json`, `per_query_results.jsonl`, `worst_queries.csv`, `label_resolution_report.json`
26
+ - Compare runs: `python -m eval.compare_runs runs/<run_a> runs/<run_b>`
27
+
28
+ Recommender interface lives in `recommenders/base.py`; a random baseline is in `recommenders/dummy_random.py`. Metrics (Recall@k, MRR@10) are in `eval/metrics.py`.
29
+
30
+ ## Label probing & backfill (improve label coverage)
31
+ - Probe unmatched label URLs (after a label match run): `python -m scripts.probe_unmatched_labels --labels data/label_resolution_report.json --output reports/label_url_probe.csv` — classifies label URLs (valid detail vs 404/blocked).
32
+ - Backfill valid label pages into DB: `python -m crawler.backfill_labels --probe-csv reports/label_url_probe.csv --allow-robots-bypass` — fetches & inserts DETAIL_PAGE_VALID URLs.
33
+ - Re-export and rematch after backfill:
34
+ - `python -m crawler.run --mode=export`
35
+ - `python -m data.catalog_loader --input data/catalog.jsonl --output data/catalog_with_ids.jsonl`
36
+ - `python -m data.train_loader --catalog data/catalog.jsonl --train <train_file> --sheet "Train-Set" --report data/label_resolution_report.json`
37
+
38
+ ## Vector pipeline (semantic retrieval)
39
+ - Build doc_text: `python -m data.document_builder --input data/catalog.jsonl --output data/catalog_docs.jsonl`
40
+ - Generate embeddings: `python -m embeddings.generator --catalog data/catalog_docs.jsonl --model sentence-transformers/all-MiniLM-L6-v2 --output-dir data/embeddings`
41
+ - Build FAISS index: `python -m retrieval.build_index --embeddings data/embeddings/embeddings.npy --ids data/embeddings/assessment_ids.json --index-path data/faiss_index/index.faiss`
42
+ - Vector components:
43
+ - Model wrapper: `models/embedding_model.py`
44
+ - Index wrapper: `retrieval/vector_index.py`
45
+ - Index builder script: `retrieval/build_index.py`
46
+ - Vector recommender scaffold: `recommenders/vector_recommender.py` (wire with assessment_ids + index)
47
+
48
+ ## Hybrid retrieval (BM25 + vector with RRF)
49
+ - Run hybrid eval: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-candidates 200 --rrf-k 60`
50
+ - Run hybrid + cross-encoder rerank: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf_rerank --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2 --topn-candidates 200 --rrf-k 60`
51
+ - Run hybrid + LGBM rerank: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf_lgbm --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-candidates 200 --rrf-k 60 --lgbm-model models/reranker/v0.1.0/lgbm_model.txt --lgbm-features models/reranker/v0.1.0/feature_schema.json`
52
+ - Diagnostics (positives in top-N vs top-10): `python -m eval.diagnostic_topk --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn 200`
53
+ - Run ablation (bm25/vector/hybrid across topN): `python -m scripts.run_ablation --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-list 100,200,377`
54
+
55
+ ## Current findings & next steps
56
+ - Candidate coverage is solved by top200; ranking is the bottleneck. Use union fusion + rerank.
57
+ - Locked decisions:
58
+ - Candidate pool (train): top200
59
+ - Candidate pool (infer): top100–200
60
+ - Base retriever: hybrid (BM25 + vector), union fusion, dual-query (raw + rewritten).
61
+ - Next: focus on reranking and constraint handling; no more embedding/model swaps.
62
+
63
+ ## Core pipeline (concise commands)
64
+
65
+ ### Build rich docs, embeddings, index (BGE)
66
+ ```bash
67
+ python -m data.document_builder \
68
+ --input data/catalog.jsonl \
69
+ --output data/catalog_docs_rich.jsonl \
70
+ --variant rich \
71
+ --version v2_struct
72
+
73
+ python -m embeddings.generator \
74
+ --catalog data/catalog_docs_rich.jsonl \
75
+ --model BAAI/bge-small-en-v1.5 \
76
+ --batch-size 32 \
77
+ --output-dir data/embeddings_bge
78
+
79
+ python -m retrieval.build_index \
80
+ --embeddings data/embeddings_bge/embeddings.npy \
81
+ --ids data/embeddings_bge/assessment_ids.json \
82
+ --index-path data/faiss_index/index_bge.faiss
83
+ ```
84
+
85
+ ### Build vocab for query rewriter (optional, recommended)
86
+ ```bash
87
+ python -m scripts.build_role_vocab \
88
+ --catalog data/catalog_docs_rich.jsonl \
89
+ --out data/catalog_role_vocab.json
90
+ ```
91
+
92
+ ### Evaluate hybrid + cross-encoder rerank (with rewriting and union fusion)
93
+ ```bash
94
+ python -m eval.run_eval \
95
+ --catalog data/catalog_docs_rich.jsonl \
96
+ --train data/Gen_AI\ Dataset.xlsx \
97
+ --recommender hybrid_rrf_rerank \
98
+ --vector-index data/faiss_index/index_bge.faiss \
99
+ --assessment-ids data/embeddings_bge/assessment_ids.json \
100
+ --model BAAI/bge-small-en-v1.5 \
101
+ --reranker-model models/reranker_crossenc/v0.1.0 \
102
+ --topn-candidates 200 --rrf-k 60 \
103
+ --use-rewriter --vocab data/catalog_role_vocab.json \
104
+ --out-dir runs/$(date +%Y%m%d_%H%M%S)_hybrid_rrf_rerank_rewrite
105
+ ```
106
+
107
+ ### Candidate coverage (bm25 vs vector vs hybrid; grouped per query)
108
+ ```bash
109
+ python -m scripts.candidate_coverage \
110
+ --catalog data/catalog_docs_rich.jsonl \
111
+ --train data/Gen_AI\ Dataset.xlsx \
112
+ --vector-index data/faiss_index/index_bge.faiss \
113
+ --assessment-ids data/embeddings_bge/assessment_ids.json \
114
+ --embedding-model BAAI/bge-small-en-v1.5 \
115
+ --topn 200 \
116
+ --use-rewriter --vocab data/catalog_role_vocab.json \
117
+ --out runs/candidate_coverage.jsonl
118
+
119
+ python -m scripts.summarize_candidate_coverage \
120
+ --input runs/candidate_coverage.jsonl \
121
+ --out runs/candidate_coverage_stats.json
122
+ ```
123
+
124
+ ### Rewrite impact (optional)
125
+ ```bash
126
+ python -m scripts.eval_rewrite_impact \
127
+ --catalog data/catalog_docs_rich.jsonl \
128
+ --train data/Gen_AI\ Dataset.xlsx \
129
+ --vector-index data/faiss_index/index_bge.faiss \
130
+ --assessment-ids data/embeddings_bge/assessment_ids.json \
131
+ --embedding-model BAAI/bge-small-en-v1.5 \
132
+ --topn 200 \
133
+ --vocab data/catalog_role_vocab.json \
134
+ --out runs/rewrite_impact.jsonl
135
+ ```
136
+
137
+ ## Frontend + backend (Next.js + FastAPI)
138
+
139
+ Backend (FastAPI):
140
+ - Start: `uvicorn agent.server:app --reload --port 8000`
141
+ - Health: `GET /health`
142
+ - Chat: `POST /chat` (returns compact top-10 + optional summary when verbose=true)
143
+ - Recommend: `POST /recommend` with `{"query": "..."}` returns `{"recommended_assessments": [...]}` (top-10)
144
+
145
+ Frontend (Next.js in `frontend/`):
146
+ - Install deps: `cd frontend && npm install`
147
+ - Dev: `npm run dev` (will start on port 3000; ensure backend is running on 8000 or set API base in UI)
148
+ - Build/start: `npm run build && npm run start`
149
+ - UI is at `http://localhost:3000/` (API base defaults to `http://localhost:8000`, editable in the UI)
agent/app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Lightweight agent harness (no LangChain server) to demonstrate the tool stack end-to-end.
5
+ This keeps ranking deterministic; LLM can be plugged later for structured QueryPlan.
6
+ """
7
+
8
+ import json
9
+ from typing import Callable
10
+
11
+ import pandas as pd
12
+
13
+ from data.catalog_loader import load_catalog
14
+ from recommenders.bm25 import BM25Recommender
15
+ from recommenders.vector_recommender import VectorRecommender
16
+ from retrieval.vector_index import VectorIndex
17
+ from models.embedding_model import EmbeddingModel
18
+ from rerankers.cross_encoder import CrossEncoderReranker
19
+
20
+ from tools.query_plan_tool import build_query_plan
21
+ from tools.retrieve_tool import retrieve_candidates
22
+ from tools.rerank_tool import rerank_candidates
23
+ from tools.constraints_tool import apply_constraints
24
+ from tools.explain_tool import explain
25
+
26
+
27
+ def load_resources():
28
+ df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
29
+ bm25 = BM25Recommender(df_catalog)
30
+ embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
31
+ index = VectorIndex.load("data/faiss_index/index_bge.faiss")
32
+ with open("data/embeddings_bge/assessment_ids.json") as f:
33
+ ids = json.load(f)
34
+ vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
35
+ return df_catalog, bm25, vec
36
+
37
+
38
+ def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
39
+ cat = df_catalog.set_index("assessment_id")
40
+
41
+ def lookup(aid: str) -> dict:
42
+ if aid in cat.index:
43
+ return cat.loc[aid].to_dict()
44
+ return {}
45
+
46
+ return lookup
47
+
48
+
49
+ def run_query(user_text: str, vocab_path="data/catalog_role_vocab.json"):
50
+ vocab = json.load(open(vocab_path)) if vocab_path else {}
51
+ df_catalog, bm25, vec = load_resources()
52
+ catalog_lookup = make_catalog_lookup(df_catalog)
53
+
54
+ # Step 1: plan (deterministic rewriter for now; swap with LLM structured plan if desired)
55
+ plan = build_query_plan(user_text, vocab=vocab)
56
+
57
+ # Step 2: retrieve (union)
58
+ cand_set = retrieve_candidates(plan, bm25, vec, topn=200, catalog_df=df_catalog)
59
+
60
+ # Step 3: rerank (use best reranker)
61
+ reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
62
+ ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
63
+
64
+ # Step 4: constraints (hook; currently passthrough)
65
+ final_list = apply_constraints(plan, ranked)
66
+
67
+ # Step 5: explanation
68
+ summary = explain(plan, final_list, catalog_lookup)
69
+ return summary
70
+
71
+
72
+ if __name__ == "__main__":
73
+ import sys
74
+
75
+ user_text = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
76
+ print(run_query(user_text))
agent/chat_agent.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Chat-style agent using Gemini for planning + explanation, deterministic tools for retrieval/rerank.
5
+ Set GOOGLE_API_KEY in your environment.
6
+ """
7
+ import json
8
+ import os
9
+ from typing import Callable
10
+
11
+ import pandas as pd
12
+
13
+ from data.catalog_loader import load_catalog
14
+ from recommenders.bm25 import BM25Recommender
15
+ from recommenders.vector_recommender import VectorRecommender
16
+ from retrieval.vector_index import VectorIndex
17
+ from models.embedding_model import EmbeddingModel
18
+ from rerankers.cross_encoder import CrossEncoderReranker
19
+
20
+ from tools.query_plan_tool_llm import build_query_plan_llm
21
+ from tools.query_plan_tool import build_query_plan as deterministic_plan
22
+ from tools.retrieve_tool import retrieve_candidates
23
+ from tools.rerank_tool import rerank_candidates
24
+ from tools.constraints_tool import apply_constraints
25
+ from tools.explain_tool import explain
26
+ from schemas.query_plan import QueryPlan
27
+
28
+
29
+ def load_resources():
30
+ df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
31
+ bm25 = BM25Recommender(df_catalog)
32
+ embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
33
+ index = VectorIndex.load("data/faiss_index/index_bge.faiss")
34
+ with open("data/embeddings_bge/assessment_ids.json") as f:
35
+ ids = json.load(f)
36
+ vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
37
+ catalog_by_id = {row["assessment_id"]: row for _, row in df_catalog.iterrows()}
38
+ return df_catalog, bm25, vec, catalog_by_id
39
+
40
+
41
+ def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
42
+ cat = df_catalog.set_index("assessment_id")
43
+
44
+ def lookup(aid: str) -> dict:
45
+ if aid in cat.index:
46
+ return cat.loc[aid].to_dict()
47
+ return {}
48
+
49
+ return lookup
50
+
51
+
52
+ def _maybe_clarify(plan: QueryPlan, cand_count: int, topn: int) -> str | None:
53
+ # LLM-flagged clarification
54
+ if plan.needs_clarification and plan.clarifying_question:
55
+ return plan.clarifying_question
56
+ # Coverage-based triggers
57
+ if cand_count < max(10, int(0.25 * topn)):
58
+ return "Results look thin. Clarify: are you looking for (1) personality/culture fit, (2) leadership judgment (SJT), or (3) role capability?"
59
+ if plan.intent in {"BEHAVIORAL", "UNKNOWN", "MIXED"} and cand_count < max(20, int(0.5 * topn)):
60
+ return "For culture/behavioral focus, choose: (1) personality/culture fit, (2) leadership judgment (SJT), or (3) role capability. Please pick one."
61
+ return None
62
+
63
+
64
+ def run_chat(
65
+ user_text: str,
66
+ vocab_path: str = "data/catalog_role_vocab.json",
67
+ model_name: str = "gemini-2.5-flash-lite",
68
+ clarification_answer: str | None = None,
69
+ topn: int = 200,
70
+ verbose: bool = False,
71
+ ):
72
+ vocab = json.load(open(vocab_path)) if vocab_path and os.path.exists(vocab_path) else {}
73
+ df_catalog, bm25, vec, catalog_by_id = load_resources()
74
+ catalog_lookup = make_catalog_lookup(df_catalog)
75
+
76
+ trace_id = f"trace-{abs(hash(user_text))}"
77
+ log = {"trace_id": trace_id, "raw_query": user_text}
78
+
79
+ # Plan with LLM; fallback deterministic if LLM fails
80
+ try:
81
+ plan = build_query_plan_llm(user_text, vocab=vocab, model_name=model_name)
82
+ QueryPlan.model_validate(plan.dict()) # schema guard
83
+ log["plan_source"] = "llm"
84
+ except Exception as e:
85
+ plan = deterministic_plan(user_text, vocab=vocab)
86
+ log["plan_source"] = f"deterministic (llm_fail={str(e)})"
87
+ log["query_plan"] = plan.dict()
88
+
89
+ # Retrieve union
90
+ cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
91
+ if verbose:
92
+ log["candidates"] = [c.model_dump() for c in cand_set.candidates[:10]]
93
+
94
+ # Clarification loop
95
+ question = _maybe_clarify(plan, cand_count=len(cand_set.candidates), topn=topn)
96
+ if question and not clarification_answer:
97
+ log["clarification"] = question
98
+ if verbose:
99
+ print(json.dumps(log, indent=2))
100
+ return f"Clarification needed: {question}"
101
+ if question and clarification_answer:
102
+ clarified_text = f"{user_text}\nUser clarification: {clarification_answer}"
103
+ try:
104
+ plan = build_query_plan_llm(clarified_text, vocab=vocab, model_name=model_name)
105
+ QueryPlan.model_validate(plan.dict())
106
+ except Exception:
107
+ plan = deterministic_plan(clarified_text, vocab=vocab)
108
+ log["query_plan_clarified"] = plan.dict()
109
+ cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
110
+ if verbose:
111
+ log["candidates_clarified"] = [c.model_dump() for c in cand_set.candidates[:10]]
112
+
113
+ # Rerank
114
+ reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
115
+ ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
116
+ log["rerank"] = [item.model_dump() for item in ranked.items]
117
+
118
+ # Constraints
119
+ final_list = apply_constraints(plan, ranked, catalog_by_id, k=10)
120
+ log["final"] = [item.model_dump() for item in final_list.items]
121
+
122
+ # Explain
123
+ summary = explain(plan, final_list, catalog_lookup)
124
+ log["summary"] = summary
125
+
126
+ # Compact output: top-10 with metadata
127
+ final_results = []
128
+ for item in final_list.items:
129
+ meta = catalog_lookup(item.assessment_id)
130
+ final_results.append(
131
+ {
132
+ "assessment_id": item.assessment_id,
133
+ "score": item.score,
134
+ "name": meta.get("name"),
135
+ "url": meta.get("url"),
136
+ "test_type_full": meta.get("test_type_full") or meta.get("test_type"),
137
+ "duration": meta.get("duration_minutes") or meta.get("duration"),
138
+ }
139
+ )
140
+
141
+ if verbose:
142
+ log["final_results"] = final_results
143
+ print(json.dumps(log, indent=2))
144
+ else:
145
+ print(json.dumps({"trace_id": trace_id, "final_results": final_results}, indent=2))
146
+
147
+ return summary
148
+
149
+
150
+ if __name__ == "__main__":
151
+ import sys
152
+
153
+ if "GOOGLE_API_KEY" not in os.environ:
154
+ print("Please set GOOGLE_API_KEY for Gemini.")
155
+ user_text = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
156
+ print(run_chat(user_text, verbose=False))
agent/router_agent.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Router-style agent (minimal, deterministic) that orchestrates the tool stack:
5
+ - build_query_plan
6
+ - retrieve_candidates
7
+ - rerank_candidates
8
+ - apply_constraints
9
+ - explain
10
+
11
+ This is intentionally simple and does not require an LLM. You can swap
12
+ build_query_plan with an LLM-based planner that emits the same QueryPlan schema.
13
+ """
14
+ import json
15
+ from typing import Callable
16
+
17
+ import pandas as pd
18
+
19
+ from data.catalog_loader import load_catalog
20
+ from recommenders.bm25 import BM25Recommender
21
+ from recommenders.vector_recommender import VectorRecommender
22
+ from retrieval.vector_index import VectorIndex
23
+ from models.embedding_model import EmbeddingModel
24
+ from rerankers.cross_encoder import CrossEncoderReranker
25
+
26
+ from tools.query_plan_tool import build_query_plan
27
+ from tools.retrieve_tool import retrieve_candidates
28
+ from tools.rerank_tool import rerank_candidates
29
+ from tools.constraints_tool import apply_constraints
30
+ from tools.explain_tool import explain
31
+
32
+
33
+ def load_resources():
34
+ df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
35
+ bm25 = BM25Recommender(df_catalog)
36
+ embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
37
+ index = VectorIndex.load("data/faiss_index/index_bge.faiss")
38
+ with open("data/embeddings_bge/assessment_ids.json") as f:
39
+ ids = json.load(f)
40
+ vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
41
+ return df_catalog, bm25, vec
42
+
43
+
44
+ def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
45
+ cat = df_catalog.set_index("assessment_id")
46
+
47
+ def lookup(aid: str) -> dict:
48
+ if aid in cat.index:
49
+ return cat.loc[aid].to_dict()
50
+ return {}
51
+
52
+ return lookup
53
+
54
+
55
+ def route_query(user_text: str, vocab_path: str = "data/catalog_role_vocab.json") -> str:
56
+ vocab = json.load(open(vocab_path)) if vocab_path else {}
57
+ df_catalog, bm25, vec = load_resources()
58
+ catalog_lookup = make_catalog_lookup(df_catalog)
59
+
60
+ # 1) Plan (deterministic rewriter; swap with LLM-structured plan if desired)
61
+ plan = build_query_plan(user_text, vocab=vocab)
62
+
63
+ # 2) Clarification hook
64
+ # Placeholder: in an interactive app, if plan.needs_clarification or coverage is weak,
65
+ # ask a question and rebuild the plan with the user response.
66
+
67
+ # 3) Retrieve (union of BM25 + vector)
68
+ cand_set = retrieve_candidates(plan, bm25, vec, topn=200, catalog_df=df_catalog)
69
+
70
+ # 4) Rerank (cross-encoder)
71
+ reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
72
+ ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
73
+
74
+ # 5) Apply constraints (stub; extend for duration/remote/adaptive)
75
+ final_list = apply_constraints(plan, ranked)
76
+
77
+ # 6) Explain
78
+ summary = explain(plan, final_list, catalog_lookup)
79
+ return summary
80
+
81
+
82
+ if __name__ == "__main__":
83
+ import sys
84
+
85
+ user_text = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
86
+ print(route_query(user_text))
agent/server.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Minimal chat backend (FastAPI) that delegates to the agent app pipeline.
5
+
6
+ Run:
7
+ uvicorn agent.server:app --reload --port 8000
8
+ """
9
+
10
+ import uuid
11
+ import json
12
+ from typing import Optional, Callable
13
+ from collections import deque
14
+ import time
15
+ import math
16
+
17
+ from fastapi import FastAPI
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import FileResponse
20
+ from fastapi.staticfiles import StaticFiles
21
+ from pydantic import BaseModel
22
+
23
+ from functools import lru_cache
24
+ import os
25
+ from data.catalog_loader import load_catalog
26
+ from recommenders.bm25 import BM25Recommender
27
+ from recommenders.vector_recommender import VectorRecommender
28
+ from retrieval.vector_index import VectorIndex
29
+ from models.embedding_model import EmbeddingModel
30
+ from rerankers.cross_encoder import CrossEncoderReranker
31
+ from tools.query_plan_tool import build_query_plan
32
+ from tools.query_plan_tool_llm import build_query_plan_llm
33
+ from llm.nu_extract import NuExtractWrapper, default_query_rewrite_examples
34
+ from llm.qwen_rewriter import QwenRewriter
35
+ from tools.retrieve_tool import retrieve_candidates
36
+ from tools.rerank_tool import rerank_candidates
37
+ from tools.constraints_tool import apply_constraints
38
+
39
+
40
class ChatRequest(BaseModel):
    # Request payload for POST /chat.
    query: str
    # Answer to a prior clarification question; not referenced anywhere in
    # this module yet (reserved for the interactive clarification hook).
    clarification_answer: Optional[str] = None
    # When true, the response includes plan/candidate/rerank debug payloads.
    verbose: bool = False
44
+
45
class RecommendRequest(BaseModel):
    # Request payload for POST /recommend.
    query: str
    # Optional LLM rewriter model name, forwarded to load_resources() as an override.
    llm_model: Optional[str] = None
    # When true, the response includes debug payloads and a summary.
    verbose: bool = False
49
+
50
+
51
+ def _make_catalog_lookup(df_catalog) -> Callable[[str], dict]:
52
+ cat = df_catalog.set_index("assessment_id")
53
+ def lookup(aid: str) -> dict:
54
+ if aid in cat.index:
55
+ return cat.loc[aid].to_dict()
56
+ return {}
57
+ return lookup
58
+
59
+
60
@lru_cache(maxsize=1)
def load_resources(llm_model_override: Optional[str] = None):
    """Load and cache every heavy pipeline resource.

    Returns the tuple (df_catalog, bm25, vec, reranker, lookup, vocab,
    llm_extractor, catalog_by_id).

    NOTE(review): with maxsize=1, calling this with a different
    llm_model_override evicts the previous entry and reloads everything —
    confirm that is intended for per-request overrides.
    """
    # Catalog dataframe is the backbone for all retrievers below.
    df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
    bm25 = BM25Recommender(df_catalog)
    embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
    index = VectorIndex.load("data/faiss_index/index_bge.faiss")
    # Row ids aligned positionally with the FAISS index vectors.
    with open("data/embeddings_bge/assessment_ids.json") as f:
        ids = json.load(f)
    vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
    reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
    lookup = _make_catalog_lookup(df_catalog)
    # Full rows keyed by assessment_id, consumed by the constraints stage.
    catalog_by_id = {row["assessment_id"]: row for _, row in df_catalog.iterrows()}
    vocab = {}
    vocab_path = "data/catalog_role_vocab.json"
    if os.path.exists(vocab_path):
        try:
            with open(vocab_path) as vf:
                vocab = json.load(vf)
        except Exception:
            # Best-effort: a malformed vocab file degrades to an empty vocab.
            vocab = {}
    # Optional LLM rewriter; choose via request override or env LLM_MODEL
    llm_extractor = None
    llm_model = llm_model_override or os.getenv("LLM_MODEL", "").strip()
    if not llm_model:
        llm_model = "Qwen/Qwen2.5-1.5B-Instruct"
    try:
        if llm_model.lower().startswith("qwen"):
            llm_extractor = QwenRewriter(model_name=llm_model, default_examples=default_query_rewrite_examples())
        elif not os.getenv("GOOGLE_API_KEY"):
            # Without a Google key, fall back to the local NuExtract wrapper.
            llm_extractor = NuExtractWrapper(default_examples=default_query_rewrite_examples())
    except Exception:
        # The rewriter is optional; the pipeline runs without one.
        llm_extractor = None
    return df_catalog, bm25, vec, reranker, lookup, vocab, llm_extractor, catalog_by_id
93
+
94
+
95
+ def _infer_remote_adaptive(meta: dict) -> (Optional[bool], Optional[bool]):
96
+ remote = meta.get("remote_support", True if meta.get("remote_support") is None else meta.get("remote_support"))
97
+ adaptive = meta.get("adaptive_support")
98
+ text_blob = " ".join([str(meta.get("name", "")), str(meta.get("description", "")), str(meta.get("doc_text", ""))]).lower()
99
+ if adaptive is None and "adaptive" in text_blob:
100
+ adaptive = True
101
+ return remote, adaptive
102
+
103
+
104
def _build_plan_with_fallback(query: str, vocab: dict, llm_extractor):
    """
    Build the query plan using the LLM rewriter (Qwen) when available, otherwise
    fall back to deterministic rewrite. No Gemini refinement to keep behavior predictable.

    Any exception raised by the LLM-assisted path triggers a second, purely
    deterministic build_query_plan call without the extractor.
    """
    try:
        return build_query_plan(query, vocab=vocab, llm_extractor=llm_extractor)
    except Exception:
        return build_query_plan(query, vocab=vocab)
113
+
114
+
115
+ def _safe_num(val):
116
+ try:
117
+ if val is None:
118
+ return None
119
+ f = float(val)
120
+ if math.isfinite(f):
121
+ return f
122
+ except Exception:
123
+ return None
124
+ return None
125
+
126
+
127
+ def _sanitize_debug(obj):
128
+ """Recursively replace NaN/inf with None to keep JSON safe."""
129
+ if isinstance(obj, dict):
130
+ return {k: _sanitize_debug(v) for k, v in obj.items()}
131
+ if isinstance(obj, list):
132
+ return [_sanitize_debug(v) for v in obj]
133
+ if isinstance(obj, tuple):
134
+ return tuple(_sanitize_debug(v) for v in obj)
135
+ if isinstance(obj, (int, float)):
136
+ return _safe_num(obj)
137
+ return obj
138
+
139
+
140
+ CODE_TO_FULL = {
141
+ "A": "Ability & Aptitude",
142
+ "B": "Biodata & Situational Judgement",
143
+ "C": "Competencies",
144
+ "D": "Development & 360",
145
+ "E": "Assessment Exercises",
146
+ "K": "Knowledge & Skills",
147
+ "P": "Personality & Behavior",
148
+ "S": "Simulations",
149
+ }
150
+
151
+
152
+ def _format_test_types(meta: dict) -> list[str]:
153
+ if meta.get("test_type_full"):
154
+ raw = meta["test_type_full"]
155
+ elif meta.get("test_type"):
156
+ raw = meta["test_type"]
157
+ else:
158
+ return []
159
+ if isinstance(raw, list):
160
+ vals = raw
161
+ else:
162
+ vals = str(raw).replace("/", ",").split(",")
163
+ out = []
164
+ for v in vals:
165
+ v = v.strip()
166
+ if not v:
167
+ continue
168
+ # Map letter codes to full names when applicable
169
+ if len(v) == 1 and v in CODE_TO_FULL:
170
+ out.append(CODE_TO_FULL[v])
171
+ else:
172
+ out.append(v)
173
+ return out
174
+
175
+
176
def _run_pipeline(query: str, topn: int = 200, verbose: bool = False, llm_model: Optional[str] = None):
    """Execute plan -> retrieve -> rerank -> constrain and shape API results.

    Returns (final_results, summary, debug_payload) where final_results is a
    list of JSON-ready dicts, summary is {"plan": intent, "top": count}, and
    debug_payload is populated only when verbose is True.
    """
    if verbose:
        # For debugging, bypass cached resources to ensure fresh state
        load_resources.cache_clear()
    df_catalog, bm25, vec, reranker, lookup, vocab, llm_extractor, catalog_by_id = load_resources(llm_model_override=llm_model)
    plan = _build_plan_with_fallback(query, vocab=vocab, llm_extractor=llm_extractor)
    cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
    ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
    final_list = apply_constraints(plan, ranked, catalog_by_id, k=10)

    debug_payload = {}
    if verbose:
        debug_payload["plan"] = plan.dict()
        # If plan carries a source (from planner), include it
        if hasattr(plan, "plan_source"):
            debug_payload["plan_source"] = getattr(plan, "plan_source")
        # Capture NuExtract LLM debug if present
        if hasattr(plan, "llm_debug") and plan.llm_debug:
            debug_payload["llm_debug"] = plan.llm_debug
        if hasattr(cand_set, "fusion") and cand_set.fusion:
            debug_payload["fusion"] = cand_set.fusion
        # Top-20 candidate rankings/scores; _safe_num keeps the payload JSON-safe.
        debug_payload["candidates"] = [
            {
                "assessment_id": c.assessment_id,
                "bm25_rank": c.bm25_rank,
                "vector_rank": c.vector_rank,
                "hybrid_rank": c.hybrid_rank,
                "bm25_score": _safe_num(c.bm25_score),
                "vector_score": _safe_num(c.vector_score),
                "score": _safe_num(c.score),
            }
            for c in cand_set.candidates[: min(20, len(cand_set.candidates))]
        ]
        debug_payload["rerank"] = [
            {"assessment_id": r.assessment_id, "score": _safe_num(r.score)}
            for r in ranked.items[: min(20, len(ranked.items))]
        ]
        debug_payload["constraints"] = [
            {
                "assessment_id": r.assessment_id,
                "score": _safe_num(r.score),
                "debug": r.debug,
            }
            for r in final_list.items
        ]

    # Shape each surviving item into the public response schema.
    final_results = []
    for item in final_list.items:
        meta = lookup(item.assessment_id)
        remote, adaptive = _infer_remote_adaptive(meta)
        # NOTE(review): `score` is computed but never used below — dead local?
        score = _safe_num(item.score)
        duration = _safe_num(meta.get("duration_minutes") or meta.get("duration"))
        duration_int = int(duration) if duration is not None else None
        description = meta.get("description") or meta.get("doc_text") or ""
        test_types = _format_test_types(meta)
        final_results.append(
            {
                "url": meta.get("url"),
                "name": meta.get("name"),
                "adaptive_support": "Yes" if adaptive else "No",
                "description": description,
                "duration": duration_int if duration_int is not None else 0,
                "remote_support": "Yes" if remote else "No",
                "test_type": test_types,
            }
        )
    # Guarantee at least one result if pipeline produced candidates
    if not final_results and ranked.items:
        item = ranked.items[0]
        meta = lookup(item.assessment_id)
        remote, adaptive = _infer_remote_adaptive(meta)
        duration = _safe_num(meta.get("duration_minutes") or meta.get("duration"))
        duration_int = int(duration) if duration is not None else 0
        final_results.append(
            {
                "url": meta.get("url"),
                "name": meta.get("name"),
                "adaptive_support": "Yes" if adaptive else "No",
                "description": meta.get("description") or meta.get("doc_text") or "",
                "duration": duration_int,
                "remote_support": "Yes" if remote else "No",
                "test_type": _format_test_types(meta),
            }
        )
    summary = {"plan": plan.intent, "top": len(final_results)}
    return final_results, summary, debug_payload
262
+
263
+
264
# FastAPI application with permissive CORS so the static SPA can call it.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,  # '*' cannot be used with credentials
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve frontend assets
app.mount("/static", StaticFiles(directory="frontend"), name="static")

# Simple in-process rate limiter (max 5 requests per second)
# NOTE(review): state is global (shared across all clients, not per-IP) and
# unsynchronized; valid for a single worker process only — confirm deployment.
_timestamps = deque()  # wall-clock times of admitted requests within the window
_RATE_LIMIT = 5
_WINDOW = 1.0  # seconds
280
+
281
def _allow_request() -> bool:
    """Sliding-window admission check: record and admit, or reject.

    Evicts timestamps older than _WINDOW seconds, then admits the call only
    while fewer than _RATE_LIMIT admitted requests remain in the window.
    """
    current = time.time()
    # Drop entries that have aged out of the sliding window.
    while _timestamps and current - _timestamps[0] > _WINDOW:
        _timestamps.popleft()
    if len(_timestamps) >= _RATE_LIMIT:
        return False
    _timestamps.append(current)
    return True
289
+
290
+
291
@app.post("/chat")
def chat(req: ChatRequest):
    """Chat endpoint: run the recommendation pipeline for req.query."""
    if not _allow_request():
        return {"error": "rate limit exceeded"}
    results, summary, debug = _run_pipeline(req.query, verbose=req.verbose)
    response = {"trace_id": str(uuid.uuid4()), "final_results": results}
    if req.verbose:
        # Debug payloads are sanitized so NaN/inf never reach the JSON encoder.
        response["summary"] = summary
        response["debug"] = _sanitize_debug(debug)
    return response
302
+
303
+
304
@app.post("/recommend")
def recommend(req: RecommendRequest):
    """Recommendation endpoint; supports a per-request LLM model override."""
    if not _allow_request():
        return {"error": "rate limit exceeded"}
    results, summary, debug = _run_pipeline(
        req.query, verbose=req.verbose, llm_model=req.llm_model
    )
    payload = {"recommended_assessments": results}
    if req.verbose:
        payload["debug"] = _sanitize_debug(debug)
        payload["summary"] = summary
    return payload
314
+
315
+
316
@app.get("/health")
def health():
    """Liveness probe endpoint."""
    return {"status": "ok"}
319
+
320
+
321
@app.get("/")
def index():
    """Serve the SPA entry point (frontend/index.html)."""
    # Serve the SPA entry point
    return FileResponse("frontend/index.html")
api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API surface (REST/gRPC/WebSocket) for serving recommendations."""
config.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Centralized config loader using YAML with ENV overrides.
2
+
3
+ Environment variables prefixed with ``LRE_`` can override nested keys using
4
+ double-underscores, e.g. ``LRE_APP__LOG_LEVEL=DEBUG``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import os
11
+ import pathlib
12
+ from typing import Any, Dict
13
+
14
+ import yaml
15
+
16
+ DEFAULT_CONFIG_PATH = pathlib.Path(os.environ.get("CONFIG_PATH", "configs/config.yaml"))
17
+ ENV_PREFIX = "LRE_"
18
+
19
+
20
+ def _parse_env_value(value: str) -> Any:
21
+ """Best-effort casting for env values."""
22
+ lowered = value.lower()
23
+ if lowered in {"true", "false"}:
24
+ return lowered == "true"
25
+ try:
26
+ return int(value)
27
+ except ValueError:
28
+ pass
29
+ try:
30
+ return float(value)
31
+ except ValueError:
32
+ pass
33
+ return value
34
+
35
+
36
+ def _set_nested(config: Dict[str, Any], path: list[str], value: Any) -> None:
37
+ cursor = config
38
+ for part in path[:-1]:
39
+ cursor = cursor.setdefault(part, {})
40
+ cursor[path[-1]] = value
41
+
42
+
43
def apply_env_overrides(config: Dict[str, Any], prefix: str = ENV_PREFIX) -> Dict[str, Any]:
    """Apply ENV overrides in-place and return config."""
    matching = ((k, v) for k, v in os.environ.items() if k.startswith(prefix))
    for key, raw_value in matching:
        # e.g. LRE_APP__LOG_LEVEL -> ["app", "log_level"]
        nested_path = key[len(prefix):].lower().split("__")
        _set_nested(config, nested_path, _parse_env_value(raw_value))
    return config
51
+
52
+
53
def load_config(config_path: pathlib.Path | str | None = None) -> Dict[str, Any]:
    """Load YAML config and apply ENV overrides."""
    resolved = pathlib.Path(config_path or DEFAULT_CONFIG_PATH)
    with resolved.open() as handle:
        # An empty YAML file parses to None; normalize to an empty dict.
        loaded: Dict[str, Any] = yaml.safe_load(handle) or {}
    return apply_env_overrides(loaded)
59
+
60
+
61
def _cli() -> None:
    """CLI helper: resolve the config (with ENV overrides) and optionally print it."""
    parser = argparse.ArgumentParser(description="Config loader helper")
    parser.add_argument(
        "--print", dest="print_config", action="store_true", help="Print resolved config"
    )
    parser.add_argument("--path", dest="config_path", type=str, help="Optional config path")
    args = parser.parse_args()

    cfg = load_config(args.config_path)
    if args.print_config:
        # Dump in insertion order to mirror the YAML file's layout.
        print(yaml.dump(cfg, sort_keys=False))


if __name__ == "__main__":
    _cli()
configs/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Config package placeholder."""
configs/config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ app:
2
+ name: llm-recommendation-engine
3
+ environment: local
4
+ log_level: INFO
5
+
6
+ data:
7
+ raw_dir: data/raw
8
+ processed_dir: data/processed
9
+ cache_dir: data/cache
10
+
11
+ index:
12
+ type: faiss
13
+ dim: 384
14
+ store_path: data/index/faiss.index
15
+
16
+ models:
17
+ embedder: sentence-transformers/all-MiniLM-L6-v2
18
+ reranker: cross-encoder/ms-marco-MiniLM-L-6-v2
19
+
20
+ services:
21
+ api:
22
+ host: 0.0.0.0
23
+ port: 8000
24
+ ui:
25
+ host: 0.0.0.0
26
+ port: 3000
27
+
28
+ observability:
29
+ tracing_enabled: false
30
+ metrics_endpoint: /metrics
31
+
32
+ storage:
33
+ bucket: s3://placeholder-bucket
34
+ prefix: recommendations
35
+
36
+ crawler:
37
+ start_url: https://www.shl.com/products/product-catalog/
38
+ user_agent: llm-recommendation-engine/0.1 (+https://example.com)
39
+ max_concurrency: 2
40
+ request_delay_seconds: 1.5
41
+ jitter_seconds: 0.5
42
+ max_retries: 3
43
+ sqlite_path: data/crawler.db
configs/embedding_config.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ name: sentence-transformers/all-MiniLM-L6-v2
3
+ cache_dir: .model_cache
4
+ device: cpu
5
+ normalize_embeddings: true
6
+ batch_size: 32
7
+
8
+ preprocessing:
9
+ max_length: 512
10
+ padding: false
configs/retrieval.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ retrieval:
2
+ method: hybrid_rrf
3
+ train_topn_candidates: 200
4
+ infer_topn_candidates: 100
5
+ rrf_k: 60
crawler/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Data ingestion and crawling utilities."""
crawler/backfill_labels.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import csv
6
+ import os
7
+ from pathlib import Path
8
+
9
+ import structlog
10
+
11
+ from config import load_config
12
+ from crawler.fetcher import PlaywrightFetcher
13
+ from crawler.parser_detail import parse_detail_page
14
+ from crawler.robots import RobotsManager
15
+ from crawler.storage import PAGE_TYPE_DETAIL, PARSE_PARSED, PageRecord, Storage
16
+ from crawler.utils import RateLimiter
17
+
18
+ logger = structlog.get_logger(__name__)
19
+
20
+
21
+ async def backfill_from_probe(probe_csv: str, storage: Storage, fetcher: PlaywrightFetcher, robots: RobotsManager, allow_bypass: bool):
22
+ with open(probe_csv) as f:
23
+ reader = csv.DictReader(f)
24
+ rows = [row for row in reader if row.get("classification") == "DETAIL_PAGE_VALID"]
25
+ logger.info("backfill.labels.start", count=len(rows))
26
+ for row in rows:
27
+ url = row["url"]
28
+ allowed = allow_bypass or robots.is_allowed(url)
29
+ if not allowed:
30
+ logger.warning("backfill.detail.disallowed", url=url)
31
+ continue
32
+ if allow_bypass:
33
+ logger.warning("backfill.detail.disallowed.bypassed", url=url)
34
+ result = await fetcher.fetch(url, page_type=PAGE_TYPE_DETAIL)
35
+ storage.upsert_page(result.record)
36
+ if result.error or not result.html:
37
+ logger.error("backfill.detail.fetch_failed", url=url, error=result.error)
38
+ continue
39
+ parse_detail_page(result.html, url=url, storage=storage)
40
+ storage.update_parse_status(url, PARSE_PARSED)
41
+
42
+
43
+ def main():
44
+ parser = argparse.ArgumentParser(description="Backfill assessments from probed label URLs")
45
+ parser.add_argument("--probe-csv", required=True, help="CSV from scripts/probe_unmatched_labels.py")
46
+ parser.add_argument("--config", type=str, default=os.environ.get("CONFIG_PATH", "configs/config.yaml"))
47
+ parser.add_argument("--sqlite", type=str, default="data/crawler.db")
48
+ parser.add_argument("--allow-robots-bypass", action="store_true", help="Bypass robots.txt disallow (use responsibly)")
49
+ args = parser.parse_args()
50
+
51
+ config = load_config(args.config)
52
+ rate_limiter = RateLimiter(
53
+ base_delay=float(os.environ.get("REQUEST_DELAY_SECONDS", config.get("crawler", {}).get("request_delay_seconds", 1.5))),
54
+ jitter=float(os.environ.get("JITTER_SECONDS", config.get("crawler", {}).get("jitter_seconds", 0.5))),
55
+ )
56
+ user_agent = os.environ.get("USER_AGENT", config.get("crawler", {}).get("user_agent"))
57
+ max_retries = int(os.environ.get("MAX_RETRIES", config.get("crawler", {}).get("max_retries", 3)))
58
+
59
+ storage = Storage(args.sqlite)
60
+ robots = RobotsManager(robots_url="https://www.shl.com/robots.txt", user_agent=user_agent)
61
+ robots.load()
62
+
63
+ async def _runner():
64
+ async with PlaywrightFetcher(user_agent=user_agent, rate_limiter=rate_limiter, max_retries=max_retries) as fetcher:
65
+ await backfill_from_probe(args.probe_csv, storage, fetcher, robots, allow_bypass=args.allow_robots_bypass)
66
+
67
+ asyncio.run(_runner())
68
+ logger.info("backfill.labels.done")
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
crawler/export.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import pandas as pd
8
+ import structlog
9
+
10
+ from crawler.storage import Storage
11
+ from crawler.utils import now_iso
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
+
16
def _coerce_json(value, split_fallback: bool = False):
    """Decode a JSON-encoded string field coming from the DB.

    Non-string values pass through unchanged. On decode failure, either
    comma-split the raw string into a trimmed list (split_fallback=True)
    or return None.
    """
    if not isinstance(value, str):
        return value
    try:
        return json.loads(value)
    except Exception:
        if split_fallback:
            return [part.strip() for part in value.split(",") if part.strip()]
        return None


def _normalize_row(row) -> dict:
    """Normalize one assessments DB row into the export record schema.

    The three JSON-ish fields previously duplicated the same try/except parse
    logic inline; it now lives in the _coerce_json helper.
    """
    downloads = _coerce_json(row["downloads"])
    job_levels = _coerce_json(row["job_levels"], split_fallback=True)
    languages = _coerce_json(row.get("languages"), split_fallback=True)
    duration_minutes = row["duration_minutes"]
    duration_hours = None
    if duration_minutes is not None:
        try:
            duration_hours = float(duration_minutes) / 60.0
        except Exception:
            # Unparseable durations export as minutes-only.
            duration_hours = None
    return {
        "url": row["url"],
        "name": row["name"],
        "description": row["description"],
        "test_type": row["test_type"],
        "test_type_full": row.get("test_type_full"),
        "remote_support": bool(row["remote_support"]) if row["remote_support"] is not None else None,
        "adaptive_support": bool(row["adaptive_support"]) if row["adaptive_support"] is not None else None,
        "duration": duration_minutes,
        "duration_hours": duration_hours,
        "job_levels": job_levels,
        "languages": languages,
        "downloads": downloads,
        "source": "shl_product_catalog",
        "crawled_at": now_iso(),
    }
58
+
59
+
60
def export_catalog(
    storage: Storage,
    parquet_path: str,
    jsonl_path: Optional[str] = None,
    min_count: int = 377,
    limit: Optional[int] = None,
) -> None:
    """Export crawled assessments to Parquet (and optionally JSONL).

    Raises RuntimeError when fewer than min_count rows exist, guarding
    against exporting a partially crawled catalog.
    NOTE(review): `limit` is applied after the min_count validation, so even a
    limited export requires the full catalog to be present — confirm intended.
    """
    rows = storage.fetch_assessments()
    logger.info("export.assessments.fetched", count=len(rows))

    if len(rows) < min_count:
        raise RuntimeError(f"Validation failed: expected at least {min_count} assessments, got {len(rows)}")

    records = [_normalize_row(dict(r)) for r in rows]
    df = pd.DataFrame.from_records(records)
    if limit:
        df = df.head(limit)
        logger.info("export.limit.applied", limit=limit, rows=len(df))

    Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(parquet_path, index=False)
    logger.info("export.parquet.write", path=parquet_path, rows=len(df))

    if jsonl_path:
        df.to_json(jsonl_path, orient="records", lines=True, force_ascii=False)
        logger.info("export.jsonl.write", path=jsonl_path, rows=len(df))

    # QA summary: how much of the exported data is incomplete.
    missing_desc = df["description"].isna().sum()
    missing_duration = df["duration"].isna().sum()
    logger.info(
        "export.summary",
        missing_description=missing_desc,
        missing_duration=missing_duration,
        test_type_counts=df["test_type"].value_counts(dropna=False).to_dict(),
    )
crawler/fetcher.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ import structlog
8
+ from playwright.async_api import async_playwright, Browser, Page
9
+
10
+ from crawler.storage import PageRecord
11
+ from crawler.utils import RateLimiter
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
+
16
@dataclass
class FetchResult:
    # Page row to persist; present even when the fetch failed.
    record: PageRecord
    # HTTP status of the final attempt, if a response was received.
    status: Optional[int]
    # Raw page HTML captured on the final attempt, if any.
    html: Optional[str]
    # Stringified exception from the last failed attempt, else None.
    error: Optional[str]
22
+
23
+
24
class PlaywrightFetcher:
    """Thin wrapper around Playwright with polite rate limiting.

    A single headless-Chromium page is reused for all fetches; call
    start()/close() explicitly or use the instance as an async context manager.
    """

    def __init__(
        self,
        user_agent: str,
        rate_limiter: RateLimiter,
        max_retries: int = 3,
    ) -> None:
        # User-Agent applied to the browser context.
        self.user_agent = user_agent
        # Limiter consulted (sleep) before every request attempt.
        self.rate_limiter = rate_limiter
        self.max_retries = max_retries
        self._playwright = None
        self._browser: Optional[Browser] = None
        self._page: Optional[Page] = None

    async def __aenter__(self) -> "PlaywrightFetcher":
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.close()

    async def start(self) -> None:
        """Launch headless Chromium and open the shared page (idempotent)."""
        if self._page:
            return
        self._playwright = await async_playwright().start()
        self._browser = await self._playwright.chromium.launch(headless=True)
        context = await self._browser.new_context(user_agent=self.user_agent)
        self._page = await context.new_page()
        logger.info("fetcher.started", user_agent=self.user_agent)

    async def close(self) -> None:
        """Tear down the browser and Playwright driver; safe to call repeatedly."""
        if self._browser:
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()
        self._browser = None
        self._page = None
        logger.info("fetcher.closed")

    async def fetch(self, url: str, page_type: str) -> FetchResult:
        """Fetch url with up to max_retries attempts; always returns a FetchResult.

        On success the result carries HTML and HTTP status; after exhausting
        retries it carries the last error string plus whatever partial
        status/html was captured.
        NOTE(review): rate_limiter.sleep() appears synchronous and would block
        the event loop during the delay — confirm, consider an async sleep.
        """
        assert self._page, "Fetcher must be started before fetch()"
        attempt = 0
        last_error: Optional[str] = None
        html: Optional[str] = None
        status: Optional[int] = None

        while attempt < self.max_retries:
            attempt += 1
            self.rate_limiter.sleep()
            logger.info("fetcher.request", url=url, attempt=attempt)
            try:
                response = await self._page.goto(url, wait_until="networkidle", timeout=20000)
                # goto may return None (e.g. same-document navigation).
                status = response.status if response else None
                html = await self._page.content()
                return FetchResult(
                    record=PageRecord(url=url, page_type=page_type, http_status=status, html=html),
                    status=status,
                    html=html,
                    error=None,
                )
            except Exception as exc:  # pragma: no cover - network variability
                last_error = str(exc)
                logger.warning("fetcher.request.error", url=url, error=last_error, attempt=attempt)
        # All attempts failed: return a failure record for persistence.
        return FetchResult(
            record=PageRecord(url=url, page_type=page_type, http_status=status, html=html, error=last_error),
            status=status,
            html=html,
            error=last_error,
        )
95
+
96
+
97
def fetch_sync(url: str, page_type: str, user_agent: str, rate_limiter: RateLimiter, max_retries: int = 3) -> FetchResult:
    """Synchronous convenience wrapper: start a fetcher, fetch one URL, tear down.

    Spins up a fresh browser per call, so it is expensive for batches; prefer
    the async API. Must not be invoked from inside a running event loop
    (asyncio.run would raise RuntimeError).
    """
    async def _runner():
        async with PlaywrightFetcher(user_agent=user_agent, rate_limiter=rate_limiter, max_retries=max_retries) as fetcher:
            return await fetcher.fetch(url, page_type)

    return asyncio.run(_runner())
crawler/parser_catalog.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple
4
+ from urllib.parse import urljoin
5
+
6
+ import structlog
7
+ from bs4 import BeautifulSoup
8
+
9
+ from crawler.storage import (
10
+ PAGE_TYPE_DETAIL,
11
+ PARSE_PARSED,
12
+ PageRecord,
13
+ Storage,
14
+ )
15
+ from crawler.utils import canonicalize_url, now_iso
16
+
17
+ logger = structlog.get_logger(__name__)
18
+
19
+ ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
20
+ GREEN_TOKENS = ["green", "#8ac640", "rgb(138", "rgb(103", "0, 167, 83", "8ac640"]
21
+
22
+
23
def _has_green_indicator(cell) -> bool:
    """Heuristically decide whether a table cell renders a green "yes" marker.

    Checks inline styles, class names and SVG fill attributes for known green
    tokens, then falls back to treating any icon/dot element as a positive
    indicator (the color may be applied via external CSS we cannot inspect).
    """
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        # class may be a list (bs4 default) or a plain string; normalize both.
        classes = " ".join(el.get("class", [])).lower() if isinstance(el.get("class"), list) else str(el.get("class") or "").lower()
        combined = f"{style} {classes}"
        if any(tok in combined for tok in GREEN_TOKENS):
            return True
        if "-yes" in classes or "catalogue__circle" in classes:
            return True
        fill = (el.get("fill") or "").lower()
        if any(tok in fill for tok in GREEN_TOKENS):
            return True
        # Generic icon/dot detection (when color is applied via CSS)
        if el.name in {"svg", "circle", "path", "i"}:
            return True
        if "dot" in classes or "indicator" in classes:
            return True
    return False
41
+
42
+
43
def extract_catalog_entries(html: str) -> List[dict]:
    """Parse catalog page for individual test solutions.

    This is intentionally defensive; selectors may change on shl.com. We look for anchors within
    sections that mention "Individual Test Solutions" or tables with product rows.
    """
    soup = BeautifulSoup(html, "lxml")
    entries = []

    tables = soup.find_all("table")
    for table in tables:
        headers = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in headers and "Assessment" not in headers:
            continue
        for row in table.find_all("tr"):
            link = row.find("a", href=True)
            if not link:
                continue
            name = link.get_text(strip=True)
            detail_url = link["href"]
            # Single-letter span badges (A/B/C/...) encode SHL test-type codes.
            badges_text = [span.get_text("", strip=True) for span in row.find_all("span")]
            test_letters = []
            for token in badges_text:
                token = token.strip()
                if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                    test_letters.append(token)
            # dict.fromkeys de-duplicates while preserving badge order.
            test_type = ",".join(dict.fromkeys(test_letters)) or None
            tds = row.find_all("td")
            remote = None
            adaptive = None
            if len(tds) >= 3:
                # Assumed column layout: name | remote | adaptive — TODO confirm.
                remote = _has_green_indicator(tds[1])
                adaptive = _has_green_indicator(tds[2])
            else:
                # Fallback: infer the flags from flattened badge text.
                flat_badges = " ".join(badges_text).lower()
                remote = "remote" in flat_badges
                adaptive = "adaptive" in flat_badges or "irt" in flat_badges
            # NOTE(review): `remote if remote else None` collapses an explicit
            # False into None — confirm that "no" should really be dropped.
            entries.append(
                {
                    "name": name,
                    "url": detail_url,
                    "test_type": test_type or None,
                    "remote_support": remote if remote else None,
                    "adaptive_support": adaptive if adaptive else None,
                }
            )
    return entries
90
+
91
+
92
def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links (Next or numbered) and resolve to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    ordered: List[str] = []
    seen = set()
    for anchor in soup.find_all("a", href=True):
        label = anchor.get_text(" ", strip=True).lower()
        if "next" not in label and not label.isdigit():
            continue
        resolved = canonicalize_url(urljoin(source_url, anchor["href"]))
        # De-duplicate while preserving first-seen order.
        if resolved not in seen:
            seen.add(resolved)
            ordered.append(resolved)
    return ordered
108
+
109
+
110
def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
    """Parse one catalog page: persist discovered entries and report progress.

    Returns (entry_count, discovered_detail_urls, next_page_urls).
    """
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []

    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        # Queue the detail page for a later crawl pass...
        storage.upsert_page(
            PageRecord(
                url=detail_url,
                page_type=PAGE_TYPE_DETAIL,
            )
        )
        # ...and seed the assessment row with catalog-level metadata.
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": canonicalize_url(source_url),
                "discovered_at": now_iso(),
            }
        )

    storage.update_parse_status(source_url, PARSE_PARSED)
    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages
crawler/parser_detail.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Dict, Iterable, List, Optional
5
+
6
+ import structlog
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+ from crawler.storage import PARSE_PARSED, Storage
10
+ from crawler.utils import canonicalize_url, now_iso
11
+
12
+ logger = structlog.get_logger(__name__)
13
+
14
# Single-letter SHL test-type codes accepted when scanning detail-page badges.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
# Section labels that terminate free-text extraction: description scanning
# stops as soon as one of these headings is reached.
STOP_LABELS = [
    "Job levels",
    "Job level",
    "Languages",
    "Language",
    "Assessment length",
    "Assessment Length",
    "Test Type",
    "Remote Testing",
    "Adaptive/IRT",
    "Adaptive",
    "Downloads",
]
# Lower-cased copies for case-insensitive raw-text scans.
STOP_LABELS_LOWER = [s.lower() for s in STOP_LABELS]
# Human-readable category name for each single-letter test-type code.
TEST_TYPE_LABELS = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}
39
+
40
+
41
+ def _normalize(text: str) -> str:
42
+ return re.sub(r"\s+", " ", (text or "")).strip()
43
+
44
+
45
def _extract_text(soup: BeautifulSoup, selector: str) -> Optional[str]:
    """Normalized text of the first CSS-selector match, or None if absent/empty."""
    match = soup.select_one(selector)
    if not match:
        return None
    return _normalize(match.get_text(" ", strip=True)) or None
51
+
52
+
53
def _find_label_node(soup: BeautifulSoup, label: str) -> Optional[Tag]:
    """Locate the element that carries a field label (e.g. "Languages").

    First pass prefers elements whose text begins with the label (this covers
    exact matches and "Label: value" forms); a looser second pass accepts the
    label anywhere in the text as a whole word.
    """
    needle = label.lower()
    pool = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "span", "strong", "dt", "th", "li"])

    for candidate in pool:
        caption = _normalize(candidate.get_text(" ", strip=True)).lower()
        if caption.startswith(needle):
            return candidate

    word_re = re.compile(rf"\b{re.escape(needle)}\b")
    for candidate in pool:
        caption = _normalize(candidate.get_text(" ", strip=True)).lower()
        if word_re.search(caption):
            return candidate

    return None
65
+
66
+
67
def _extract_section_until(soup: BeautifulSoup, start_label: str, stop_labels: Iterable[str]) -> Optional[str]:
    """Collect the free text that follows `start_label` until a stop label is hit.

    Walks the document in tree order from the label node, gathering <p>/<li>
    text (and longer <div>/<span> text) until a node whose text begins with
    one of `stop_labels` is reached. Returns the joined text, or None when
    the label is absent or nothing was collected.
    """
    start = _find_label_node(soup, start_label)
    if not start:
        return None

    chunks: List[str] = []

    # If the label node itself reads "Label: inline value", keep the inline part.
    start_txt = _normalize(start.get_text(" ", strip=True))
    if re.match(rf"^{re.escape(start_label)}\s*:", start_txt, flags=re.I):
        after = re.split(rf"^{re.escape(start_label)}\s*:\s*", start_txt, flags=re.I)[-1]
        if after:
            chunks.append(after)

    # find_all_next visits the rest of the document in tree order, so a
    # container and its children are both seen; the >40-char guard below
    # limits the resulting duplication/noise from bare containers.
    for node in start.find_all_next():
        if node == start:
            continue
        if not isinstance(node, Tag):
            continue

        node_txt = _normalize(node.get_text(" ", strip=True))
        if not node_txt:
            continue

        # Stop as soon as the next labelled section begins.
        for stop in stop_labels:
            if re.match(rf"^{re.escape(stop)}\b", node_txt, flags=re.I):
                return _normalize(" ".join(chunks)) or None

        # Paragraphs and list items are always kept; generic containers only
        # when they carry a substantial amount of text.
        if node.name in {"p", "li"}:
            chunks.append(node_txt)
        elif node.name in {"div", "span"} and len(node_txt) > 40:
            chunks.append(node_txt)

    return _normalize(" ".join(chunks)) or None
100
+
101
+
102
+ def _extract_segment(text: str, label: str, stop_labels: Iterable[str]) -> Optional[str]:
103
+ """Extract substring after a label up to the next stop label in raw text."""
104
+ text_norm = _normalize(text)
105
+ lower = text_norm.lower()
106
+ label_l = label.lower()
107
+ start = lower.find(label_l)
108
+ if start == -1:
109
+ return None
110
+ start = start + len(label_l)
111
+ while start < len(text_norm) and text_norm[start] in " :":
112
+ start += 1
113
+ stop_pos = len(text_norm)
114
+ for stop in stop_labels:
115
+ pos = lower.find(stop, start)
116
+ if pos != -1 and pos < stop_pos:
117
+ stop_pos = pos
118
+ segment = text_norm[start:stop_pos].strip(" :-")
119
+ return segment or None
120
+
121
+
122
def _extract_kv_value(soup: BeautifulSoup, label: str) -> Optional[str]:
    """Find the value paired with a field label (e.g. "Languages: English").

    Fallback order: inline "Label: value" text, then the label node's own
    remaining text, then the node's following siblings, then the parent's
    combined text, then the parent's following siblings. Returns None when
    nothing usable is found.
    """
    node = _find_label_node(soup, label)
    if not node:
        return None

    txt = _normalize(node.get_text(" ", strip=True))
    # Case 1: the node itself reads "Label: value".
    m = re.match(rf"^{re.escape(label)}\s*:\s*(.+)$", txt, flags=re.I)
    if m:
        return m.group(1).strip() or None

    # Case 2: "Label value" without a colon — strip the leading label off.
    remainder = re.sub(rf"^{re.escape(label)}\s*", "", txt, flags=re.I).strip(" :-")
    if remainder and remainder.lower() != label.lower():
        return remainder

    # Case 3: the value lives in an element right after the label node.
    for sib in node.next_siblings:
        if isinstance(sib, Tag):
            v = _normalize(sib.get_text(" ", strip=True))
            if v:
                return v

    # Case 4: climb to the parent; its combined text (minus the label) or its
    # following siblings may hold the value.
    parent = node.parent if isinstance(node.parent, Tag) else None
    if parent:
        parent_txt = _normalize(parent.get_text(" ", strip=True))
        parent_remainder = re.sub(rf"\b{re.escape(label)}\b", "", parent_txt, flags=re.I).strip(" :-")
        if parent_remainder:
            return parent_remainder
        for sib in parent.find_next_siblings():
            v = _normalize(sib.get_text(" ", strip=True))
            if v:
                return v

    return None
154
+
155
+
156
+ def _extract_duration_minutes(soup: BeautifulSoup) -> Optional[int]:
157
+ text = _normalize(soup.get_text(" ", strip=True))
158
+ patterns = [
159
+ r"minutes?\s*=\s*(\d+)",
160
+ r"(\d+)\s*(?:minute|min)\b",
161
+ r"completion time.*?(\d+)\s*(?:minute|min)\b",
162
+ ]
163
+ for pat in patterns:
164
+ m = re.search(pat, text, flags=re.I)
165
+ if m:
166
+ try:
167
+ return int(m.group(1))
168
+ except Exception:
169
+ continue
170
+ return None
171
+
172
+
173
def _extract_test_type_from_meta(soup: BeautifulSoup) -> Optional[str]:
    """Collect single-letter test-type codes near the "Test Type" label.

    Assumes each code is rendered as its own one-character <span>/<button>/<a>
    element — TODO confirm against the live page markup.
    Returns a comma-joined, order-preserving, de-duplicated code string
    (e.g. "A,K"), or None when no codes are found.
    """
    label = _find_label_node(soup, "Test Type")
    # Prefer the label's parent container; fall back to the whole document.
    scope = label.parent if label and isinstance(label.parent, Tag) else label or soup

    tokens: List[str] = []
    for el in scope.find_all(["span", "button", "a"], limit=30):
        t = _normalize(el.get_text("", strip=True))
        if len(t) == 1 and t in ALLOWED_TEST_TYPES:
            tokens.append(t)
    # Second pass: scan elements after the label in document order.
    if not tokens:
        for el in label.find_all_next(["span", "button", "a"], limit=30) if label else []:
            t = _normalize(el.get_text("", strip=True))
            if len(t) == 1 and t in ALLOWED_TEST_TYPES:
                tokens.append(t)
    if not tokens:
        return None
    # De-duplicate while keeping first-seen order.
    out = []
    seen = set()
    for t in tokens:
        if t not in seen:
            seen.add(t)
            out.append(t)
    return ",".join(out)
196
+
197
+
198
def _map_test_types_full(test_type: Optional[str]) -> Optional[str]:
    """Expand a comma-separated code string (e.g. "A,K") into full category names.

    Unknown or empty tokens are skipped; returns None when nothing maps.
    """
    if not test_type:
        return None
    names = []
    for raw_code in test_type.split(","):
        code = raw_code.strip()
        if not code:
            continue
        label = TEST_TYPE_LABELS.get(code)
        if label:
            names.append(label)
    return ", ".join(names) if names else None
210
+
211
+
212
+ def _split_list(value: Optional[str]) -> Optional[list[str]]:
213
+ if not value:
214
+ return None
215
+ parts = [p.strip() for p in value.replace(";", ",").split(",") if p.strip()]
216
+ return parts or None
217
+
218
+
219
+ def _is_positive_indicator(node: Tag) -> bool:
220
+ if not node:
221
+ return False
222
+ attrs = " ".join(
223
+ [
224
+ " ".join(node.get("class", [])) if isinstance(node.get("class"), list) else str(node.get("class") or ""),
225
+ str(node.get("aria-label") or ""),
226
+ str(node.get("title") or ""),
227
+ str(node.get("style") or ""),
228
+ ]
229
+ ).lower()
230
+ positive_tokens = ["green", "yes", "true", "available", "supported", "active", "enabled", "tick", "check", "on"]
231
+ return any(tok in attrs for tok in positive_tokens)
232
+
233
+
234
def _extract_boolean_from_meta(soup: BeautifulSoup, label_text: str) -> Optional[bool]:
    """Decide a yes/no feature (e.g. "Remote Testing") from indicator icons near its label.

    Returns None when the label is absent, True when a positive-looking
    indicator element is found near it, and False otherwise — so False means
    "label present but no positive indicator seen", not an explicit negative.
    """
    label = _find_label_node(soup, label_text)
    if not label:
        return None

    # Look for indicator elements inside the label's immediate container first.
    container = label.parent if isinstance(label.parent, Tag) else label
    for el in container.find_all(["span", "i", "svg", "img"], limit=20):
        if _is_positive_indicator(el):
            return True

    # Then scan forward through the document after the label.
    for el in label.find_all_next(["span", "i", "svg", "img"], limit=20):
        if _is_positive_indicator(el):
            return True

    return False
249
+
250
+
251
def extract_detail_fields(html: str) -> Dict:
    """Extract all structured assessment fields from a detail page's HTML.

    Each field is tried via several strategies (raw-text scanning and
    structured DOM lookups) so partially structured pages still yield data.
    Fields that cannot be found come back as None.
    """
    soup = BeautifulSoup(html, "lxml")

    # Name: page <h1>, falling back to the <title> tag.
    title = _extract_text(soup, "h1") or _extract_text(soup, "title")
    full_text = _normalize(soup.get_text(" ", strip=True))
    # Description: raw-text segment first, DOM section walk as fallback.
    description = _extract_segment(full_text, "description", STOP_LABELS_LOWER)
    if not description:
        description = _extract_section_until(soup, "Description", STOP_LABELS)

    # Job levels / languages: DOM key-value lookup first, raw text second.
    job_levels_raw = _extract_kv_value(soup, "Job levels") or _extract_segment(full_text, "job levels", STOP_LABELS_LOWER)
    job_levels = _split_list(job_levels_raw)
    languages_raw = _extract_kv_value(soup, "Languages") or _extract_segment(full_text, "languages", STOP_LABELS_LOWER)
    languages = _split_list(languages_raw)

    # Duration: page-wide regex scan first, then the "Assessment length" segment.
    duration = _extract_duration_minutes(soup)
    if duration is None:
        segment = _extract_segment(full_text, "assessment length", STOP_LABELS_LOWER)
        if segment:
            match = re.search(r"(\d+)\s*(?:minute|min)", segment, flags=re.I)
            if match:
                try:
                    duration = int(match.group(1))
                except Exception:
                    duration = None

    test_type = _extract_test_type_from_meta(soup)
    test_type_full = _map_test_types_full(test_type)

    # Boolean feature flags; "Adaptive" appears under several label variants.
    remote_support = _extract_boolean_from_meta(soup, "Remote Testing")
    adaptive_support = _extract_boolean_from_meta(soup, "Adaptive/IRT")
    if adaptive_support is None:
        adaptive_support = _extract_boolean_from_meta(soup, "Adaptive")
    if adaptive_support is None:
        adaptive_support = _extract_boolean_from_meta(soup, "Adaptive/IRT Testing")

    # Download links: restrict to the "Downloads" section when present,
    # keeping only anchors whose caption looks like a document link.
    downloads = []
    downloads_label = _find_label_node(soup, "Downloads")
    scope = downloads_label.parent if downloads_label and isinstance(downloads_label.parent, Tag) else soup
    for link in scope.find_all("a", href=True):
        text = _normalize(link.get_text(" ", strip=True))
        href = link["href"]
        if text and any(keyword in text.lower() for keyword in ["report", "fact sheet", "sample", "pdf", "download", "brochure"]):
            downloads.append({"text": text, "url": href})

    return {
        "name": title,
        "description": description,
        "test_type": test_type,
        "test_type_full": test_type_full,
        "remote_support": remote_support,
        "adaptive_support": adaptive_support,
        "duration_minutes": duration,
        "job_levels": job_levels,
        "languages": languages,
        "downloads": downloads or None,
    }
307
+
308
+
309
def parse_detail_page(html: str, url: str, storage: Storage) -> Dict:
    """Extract structured fields from a detail page, persist them, and mark the page parsed."""
    fields = extract_detail_fields(html)
    record = {
        "url": canonicalize_url(url),
        **fields,
        "last_updated_at": now_iso(),
    }
    storage.upsert_assessment(record)
    storage.update_parse_status(url, PARSE_PARSED)
    logger.info("detail.parse.success", url=url, name=fields.get("name"))
    return fields
crawler/qa_checks.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Optional
7
+
8
+ import pandas as pd
9
+
10
+
11
def load_catalog(path: str) -> pd.DataFrame:
    """Load an exported catalog (.jsonl or .parquet/.pq) into a DataFrame.

    Raises FileNotFoundError when the file is missing and ValueError for an
    unsupported extension.
    """
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(f"Catalog file not found: {path}")
    suffix = file_path.suffix
    if suffix == ".jsonl":
        return pd.read_json(path, lines=True)
    if suffix in {".parquet", ".pq"}:
        return pd.read_parquet(path)
    raise ValueError(f"Unsupported catalog format: {path}")
20
+
21
+
22
def qa_checks(df: pd.DataFrame, min_count: int = 377) -> Dict[str, Any]:
    """Run data-quality gates over an exported assessment catalog.

    Args:
        df: catalog rows, one per assessment.
        min_count: minimum expected row count for the count gate. Defaults to
            377 (previously hard-coded); now parameterized so smoke-test
            exports can be checked too.

    Returns a dict with: total row count, count gate, per-column missing
    percentages, URL uniqueness, description length quality, test-type
    distribution, and boolean-type sanity flags.
    """
    total = len(df)

    def pct_missing(col: str) -> float:
        # A column that is absent entirely counts as 100% missing.
        return float(df[col].isna().mean()) * 100.0 if col in df else 100.0

    bool_sanity = {}
    for col in ["remote_support", "adaptive_support"]:
        if col in df:
            # Every non-null value must already be bool-like (bool or 0/1 int).
            bool_sanity[col] = bool(
                df[col].dropna().apply(lambda x: isinstance(x, (bool, int))).all()
            )
        else:
            bool_sanity[col] = False

    description_lengths = df["description"].dropna().apply(lambda x: len(str(x))) if "description" in df else pd.Series(dtype=int)
    min_desc_len: Optional[int] = int(description_lengths.min()) if not description_lengths.empty else None

    return {
        "total": total,
        "count_gate": total >= min_count,
        "missing_pct": {
            "description": pct_missing("description"),
            "test_type": pct_missing("test_type"),
            "remote_support": pct_missing("remote_support"),
            "adaptive_support": pct_missing("adaptive_support"),
            # Prefer a legacy "duration" column when present.
            "duration_minutes": pct_missing("duration") if "duration" in df else pct_missing("duration_minutes"),
        },
        "url_uniqueness": {
            "unique_urls": int(df["url"].nunique()) if "url" in df else 0,
            "matches_row_count": bool("url" in df and df["url"].nunique() == total),
        },
        "description_quality": {
            "min_length": min_desc_len,
            "passed_min_30": bool(min_desc_len is not None and min_desc_len >= 30),
        },
        "test_type_distribution": df["test_type"].value_counts(dropna=False).to_dict() if "test_type" in df else {},
        "boolean_sanity": bool_sanity,
    }
61
+
62
+
63
def main() -> None:
    """CLI entry: print QA gate results for a catalog file as indented JSON."""
    if len(sys.argv) < 2:
        print("Usage: python qa_checks.py <catalog.jsonl|catalog.parquet>")
        sys.exit(1)
    catalog_path = sys.argv[1]
    results = qa_checks(load_catalog(catalog_path))
    print(json.dumps(results, indent=2))
71
+
72
+
73
# Allow direct invocation: `python crawler/qa_checks.py <catalog-file>`.
if __name__ == "__main__":
    main()
crawler/robots.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import urllib.robotparser
5
+ from dataclasses import dataclass
6
+
7
+ import structlog
8
+
9
+
10
+ logger = structlog.get_logger(__name__)
11
+
12
+
13
@dataclass
class RobotsManager:
    """Thin wrapper around urllib's RobotFileParser with structured logging.

    Until load() has successfully read robots.txt, can_fetch() (and therefore
    is_allowed()) denies everything, so the crawler fails closed.
    """

    # Absolute URL of the robots.txt to honor.
    robots_url: str
    # User-agent string checked against the robots rules.
    user_agent: str

    def __post_init__(self) -> None:
        # Parser starts empty; rules are only loaded by an explicit load() call.
        self._parser = urllib.robotparser.RobotFileParser()

    def load(self) -> None:
        # Fetch and parse robots.txt; network failures are logged, not raised,
        # which leaves the parser in its deny-all state.
        logger.info("robots.load.start", robots_url=self.robots_url)
        self._parser.set_url(self.robots_url)
        try:
            self._parser.read()
            logger.info("robots.load.success", can_fetch_all=self._parser.can_fetch(self.user_agent, "*"))
        except Exception as exc:  # pragma: no cover - network errors are logged
            logger.warning("robots.load.failed", error=str(exc))

    def is_allowed(self, url: str) -> bool:
        # True when the loaded rules permit fetching `url`; False on any error.
        try:
            return self._parser.can_fetch(self.user_agent, url)
        except Exception as exc:  # pragma: no cover
            logger.warning("robots.check.error", url=url, error=str(exc))
            return False
crawler/run.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import os
6
+ from typing import Optional
7
+
8
+ import logging
9
+ import structlog
10
+
11
+ from config import load_config
12
+ from crawler.export import export_catalog
13
+ from crawler.fetcher import PlaywrightFetcher
14
+ from crawler.parser_catalog import parse_catalog_page
15
+ from crawler.parser_detail import parse_detail_page
16
+ from crawler.robots import RobotsManager
17
+ from crawler.storage import (
18
+ PAGE_TYPE_CATALOG,
19
+ PAGE_TYPE_DETAIL,
20
+ PARSE_PENDING,
21
+ Storage,
22
+ )
23
+ from crawler.utils import RateLimiter
24
+
25
+ logger = structlog.get_logger(__name__)
26
+
27
+
28
def configure_logging(log_level: str = "INFO") -> None:
    """Configure stdlib logging and structlog for ISO-timestamped JSON output.

    Unrecognized level names fall back to INFO.
    """
    level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(level=level)
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(level),
        processors=[
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer(),
        ],
    )
38
+
39
+
40
async def crawl_catalog(
    start_url: str,
    storage: Storage,
    fetcher: PlaywrightFetcher,
    robots: RobotsManager,
    max_discover: int | None = None,
    allow_robots_bypass: bool = False,
) -> None:
    """Breadth-first crawl of catalog pages starting at `start_url`.

    Each fetched catalog page is stored, parsed for detail URLs (persisted via
    parse_catalog_page) and scanned for pagination links, which extend the
    frontier. Stops when the frontier is empty or `max_discover` detail URLs
    have been found.

    Args:
        start_url: first catalog page to fetch.
        storage: persistence layer for pages/assessments.
        fetcher: rate-limited Playwright fetcher.
        robots: robots.txt gatekeeper.
        max_discover: optional cap on discovered detail URLs (smoke tests).
        allow_robots_bypass: fetch disallowed URLs anyway, logging each bypass.

    Fixes: frontier is now a deque (list.pop(0) was O(n) per dequeue), and the
    "disallowed.bypassed" warning fires only for URLs robots.txt actually
    disallows (previously it was logged for every URL whenever the flag was set).
    """
    from collections import deque  # local import keeps module-level deps unchanged

    frontier = deque([start_url])
    seen = set()
    total_discovered = 0

    while frontier:
        url = frontier.popleft()
        if url in seen:
            continue
        seen.add(url)
        robots_allowed = robots.is_allowed(url)
        if not robots_allowed:
            if not allow_robots_bypass:
                logger.warning("catalog.fetch.disallowed", url=url)
                continue
            logger.warning("catalog.fetch.disallowed.bypassed", url=url)
        result = await fetcher.fetch(url, page_type=PAGE_TYPE_CATALOG)
        storage.upsert_page(result.record)
        if result.error or not result.html:
            logger.error("catalog.fetch.failed", url=url, error=result.error)
            continue
        _, discovered_urls, next_pages = parse_catalog_page(result.html, source_url=url, storage=storage)
        total_discovered += len(discovered_urls)
        for next_url in next_pages:
            if next_url not in seen:
                frontier.append(next_url)
        if max_discover and total_discovered >= max_discover:
            logger.info("catalog.max_discover.reached", total=total_discovered, max=max_discover)
            break
76
+
77
+
78
async def crawl_details(
    storage: Storage,
    fetcher: PlaywrightFetcher,
    robots: RobotsManager,
    allow_robots_bypass: bool = False,
) -> None:
    """Fetch and parse every detail page still marked PENDING in storage.

    Fix (matching crawl_catalog): the "disallowed.bypassed" warning now fires
    only for URLs robots.txt actually disallows; previously it was logged for
    every URL whenever the bypass flag was set.
    """
    pending = storage.get_pages_by_type(PAGE_TYPE_DETAIL, parse_status=PARSE_PENDING)
    logger.info("detail.queue", pending=len(pending))
    for page in pending:
        url = page["url"]
        robots_allowed = robots.is_allowed(url)
        if not robots_allowed:
            if not allow_robots_bypass:
                logger.warning("detail.fetch.disallowed", url=url)
                continue
            logger.warning("detail.fetch.disallowed.bypassed", url=url)
        result = await fetcher.fetch(url, page_type=PAGE_TYPE_DETAIL)
        storage.upsert_page(result.record)
        if result.error or not result.html:
            logger.error("detail.fetch.failed", url=url, error=result.error)
            continue
        parse_detail_page(result.html, url=url, storage=storage)
+
101
+
102
def main(argv: Optional[list[str]] = None) -> None:
    """CLI entry point for the crawl pipeline.

    Modes:
      discover  — crawl catalog pages and queue detail URLs;
      details   — fetch and parse queued detail pages;
      crawl_all — both of the above (default);
      export    — write the assessments table to parquet/JSONL.

    Settings come from the YAML config; several can be overridden via
    environment variables (REQUEST_DELAY_SECONDS, JITTER_SECONDS, USER_AGENT,
    START_URL, MAX_RETRIES, ALLOW_ROBOTS_BYPASS).
    """
    parser = argparse.ArgumentParser(description="Crawler pipeline")
    parser.add_argument("--mode", choices=["crawl_all", "discover", "details", "export"], default="crawl_all")
    parser.add_argument("--config", type=str, default=os.environ.get("CONFIG_PATH", "configs/config.yaml"))
    parser.add_argument("--parquet", type=str, default="data/catalog.parquet")
    parser.add_argument("--jsonl", type=str, default="data/catalog.jsonl")
    parser.add_argument(
        "--max-discover",
        type=int,
        default=None,
        help="Limit number of detail URLs discovered (for smoke tests)",
    )
    parser.add_argument(
        "--limit-export",
        type=int,
        default=None,
        help="Limit number of rows exported (for smoke tests)",
    )
    parser.add_argument(
        "--allow-robots-bypass",
        action="store_true",
        help="Bypass robots.txt disallow (for testing; use responsibly)",
    )
    args = parser.parse_args(argv)

    config = load_config(args.config)
    configure_logging(config.get("app", {}).get("log_level", "INFO"))
    crawler_cfg = config.get("crawler", {})
    # Environment variables take precedence over YAML config values.
    rate_limiter = RateLimiter(
        base_delay=float(os.environ.get("REQUEST_DELAY_SECONDS", crawler_cfg.get("request_delay_seconds", 1.5))),
        jitter=float(os.environ.get("JITTER_SECONDS", crawler_cfg.get("jitter_seconds", 0.5))),
    )
    user_agent = os.environ.get("USER_AGENT", crawler_cfg.get("user_agent"))
    start_url = os.environ.get("START_URL", crawler_cfg.get("start_url"))
    max_retries = int(os.environ.get("MAX_RETRIES", crawler_cfg.get("max_retries", 3)))
    sqlite_path = crawler_cfg.get("sqlite_path", "data/crawler.db")
    allow_bypass = args.allow_robots_bypass or os.environ.get("ALLOW_ROBOTS_BYPASS", "").lower() in {"1", "true", "yes"}

    storage = Storage(sqlite_path)
    robots = RobotsManager(robots_url="https://www.shl.com/robots.txt", user_agent=user_agent)
    robots.load()

    async def _runner():
        # One Playwright session covers both crawl phases.
        async with PlaywrightFetcher(user_agent=user_agent, rate_limiter=rate_limiter, max_retries=max_retries) as fetcher:
            if args.mode in {"crawl_all", "discover"}:
                await crawl_catalog(start_url, storage, fetcher, robots, max_discover=args.max_discover, allow_robots_bypass=allow_bypass)
            if args.mode in {"crawl_all", "details"}:
                await crawl_details(storage, fetcher, robots, allow_robots_bypass=allow_bypass)

    if args.mode in {"crawl_all", "discover", "details"}:
        asyncio.run(_runner())

    if args.mode == "export":
        # Smoke-test runs (--limit-export) relax the 377-row completeness gate.
        export_catalog(
            storage,
            parquet_path=args.parquet,
            jsonl_path=args.jsonl,
            limit=args.limit_export,
            min_count=1 if args.limit_export else 377,
        )
162
+
163
+
164
# Allow direct invocation: `python crawler/run.py --mode crawl_all`.
if __name__ == "__main__":
    main()
crawler/storage.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sqlite3
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Iterable, List, Optional
9
+
10
+ import structlog
11
+
12
+ from crawler.utils import canonicalize_url, make_assessment_id, now_iso
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
+
17
+ PAGE_TYPE_CATALOG = "CATALOG"
18
+ PAGE_TYPE_DETAIL = "DETAIL"
19
+
20
+ PARSE_PENDING = "PENDING"
21
+ PARSE_PARSED = "PARSED"
22
+ PARSE_FAILED = "FAILED"
23
+
24
+
25
@dataclass
class PageRecord:
    # One fetched (or to-be-fetched) page row in the `pages` table.
    url: str  # page URL as discovered (table primary key)
    page_type: str  # PAGE_TYPE_CATALOG or PAGE_TYPE_DETAIL
    http_status: Optional[int] = None  # last HTTP status; None before any fetch
    html: Optional[str] = None  # raw HTML snapshot; None on failure
    error: Optional[str] = None  # fetch error message, if any
    retry_count: int = 0  # number of fetch retries performed
    parse_status: str = PARSE_PENDING  # PENDING / PARSED / FAILED
34
+
35
+
36
class Storage:
    """SQLite persistence layer for crawl state (pages) and extracted assessments.

    Opens (creating if needed) the database at `db_path` and ensures the schema
    on construction. NOTE(review): a single connection is shared without a
    lock — assumes single-threaded access; confirm before adding concurrency.
    """

    def __init__(self, db_path: str) -> None:
        self.db_path = db_path
        # Create the parent directory so sqlite3.connect can create the file.
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(self.db_path)
        # Row factory gives mapping-style access: row["url"].
        self.conn.row_factory = sqlite3.Row
        self.ensure_schema()

    def ensure_schema(self) -> None:
        """Create the pages / assessments / crawl_meta tables if missing."""
        logger.info("storage.schema.ensure", db_path=self.db_path)
        cur = self.conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS pages (
                url TEXT PRIMARY KEY,
                url_canonical TEXT UNIQUE,
                page_type TEXT,
                http_status INTEGER,
                fetched_at TEXT,
                html TEXT,
                error TEXT,
                retry_count INTEGER DEFAULT 0,
                parse_status TEXT DEFAULT 'PENDING'
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS assessments (
                assessment_id TEXT PRIMARY KEY,
                url TEXT UNIQUE,
                name TEXT,
                description TEXT,
                test_type TEXT,
                test_type_full TEXT,
                remote_support INTEGER,
                adaptive_support INTEGER,
                duration_minutes INTEGER,
                job_levels TEXT,
                languages TEXT,
                downloads TEXT,
                source_catalog_page TEXT,
                discovered_at TEXT,
                last_updated_at TEXT
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS crawl_meta (
                run_id TEXT,
                started_at TEXT,
                finished_at TEXT,
                total_catalog_pages INTEGER,
                total_detail_pages INTEGER,
                individual_assessment_count INTEGER,
                notes TEXT
            )
            """
        )
        self.conn.commit()

    def upsert_page(self, record: PageRecord) -> None:
        """Insert or refresh a page row; fetched_at is always set to now.

        NOTE(review): the upsert only targets a conflict on `url`; two
        distinct urls sharing one url_canonical would still violate the
        UNIQUE(url_canonical) constraint — confirm canonical collisions
        cannot occur in practice.
        """
        canonical = canonicalize_url(record.url)
        logger.debug("storage.page.upsert", url=record.url, page_type=record.page_type)
        self.conn.execute(
            """
            INSERT INTO pages (url, url_canonical, page_type, http_status, fetched_at, html, error, retry_count, parse_status)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_type=excluded.page_type,
                http_status=excluded.http_status,
                fetched_at=excluded.fetched_at,
                html=excluded.html,
                error=excluded.error,
                retry_count=excluded.retry_count,
                parse_status=excluded.parse_status
            """,
            (
                record.url,
                canonical,
                record.page_type,
                record.http_status,
                now_iso(),
                record.html,
                record.error,
                record.retry_count,
                record.parse_status,
            ),
        )
        self.conn.commit()

    def update_parse_status(self, url: str, status: str) -> None:
        # Keyed on the raw (non-canonical) url, matching upsert_page's key.
        self.conn.execute("UPDATE pages SET parse_status=? WHERE url=?", (status, url))
        self.conn.commit()

    def get_pages_by_type(self, page_type: str, parse_status: Optional[str] = None) -> List[sqlite3.Row]:
        """Pages of a given type, optionally filtered by parse status, ordered by url."""
        cur = self.conn.cursor()
        if parse_status:
            cur.execute(
                "SELECT * FROM pages WHERE page_type=? AND parse_status=? ORDER BY url", (page_type, parse_status)
            )
        else:
            cur.execute("SELECT * FROM pages WHERE page_type=? ORDER BY url", (page_type,))
        return cur.fetchall()

    def upsert_assessment(self, data: Dict[str, Any]) -> None:
        """Insert or merge an assessment row keyed by a deterministic id.

        The id derives from the canonical URL, so catalog-discovery and
        detail-parse passes update the same row. On conflict, COALESCE keeps
        existing values whenever the incoming field is NULL, i.e. new data
        only ever fills gaps or overwrites with non-NULL values.
        """
        url = data["url"]
        assessment_id = data.get("assessment_id") or make_assessment_id(url)
        data = {**data, "assessment_id": assessment_id}
        # List/dict payloads are serialized to JSON text columns.
        downloads = data.get("downloads")
        if downloads is not None and not isinstance(downloads, str):
            downloads = json.dumps(downloads)
        job_levels = data.get("job_levels")
        if isinstance(job_levels, (list, tuple)):
            job_levels = json.dumps(job_levels)
        languages = data.get("languages")
        if isinstance(languages, (list, tuple)):
            languages = json.dumps(languages)

        logger.debug("storage.assessment.upsert", url=url)
        self.conn.execute(
            """
            INSERT INTO assessments (
                assessment_id, url, name, description, test_type, test_type_full, remote_support, adaptive_support,
                duration_minutes, job_levels, languages, downloads, source_catalog_page, discovered_at, last_updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(assessment_id) DO UPDATE SET
                url=excluded.url,
                name=COALESCE(excluded.name, assessments.name),
                description=COALESCE(excluded.description, assessments.description),
                test_type=COALESCE(excluded.test_type, assessments.test_type),
                test_type_full=COALESCE(excluded.test_type_full, assessments.test_type_full),
                remote_support=COALESCE(excluded.remote_support, assessments.remote_support),
                adaptive_support=COALESCE(excluded.adaptive_support, assessments.adaptive_support),
                duration_minutes=COALESCE(excluded.duration_minutes, assessments.duration_minutes),
                job_levels=COALESCE(excluded.job_levels, assessments.job_levels),
                languages=COALESCE(excluded.languages, assessments.languages),
                downloads=COALESCE(excluded.downloads, assessments.downloads),
                source_catalog_page=COALESCE(excluded.source_catalog_page, assessments.source_catalog_page),
                last_updated_at=excluded.last_updated_at
            """,
            (
                data["assessment_id"],
                url,
                data.get("name"),
                data.get("description"),
                data.get("test_type"),
                data.get("test_type_full"),
                data.get("remote_support"),
                data.get("adaptive_support"),
                data.get("duration_minutes"),
                job_levels,
                languages,
                downloads,
                data.get("source_catalog_page"),
                data.get("discovered_at") or now_iso(),
                data.get("last_updated_at") or now_iso(),
            ),
        )
        self.conn.commit()

    def fetch_assessments(self) -> List[sqlite3.Row]:
        """All assessment rows ordered by name."""
        cur = self.conn.cursor()
        cur.execute("SELECT * FROM assessments ORDER BY name")
        return cur.fetchall()

    def count_assessments(self) -> int:
        """Total number of assessment rows."""
        cur = self.conn.cursor()
        cur.execute("SELECT COUNT(*) FROM assessments")
        return cur.fetchone()[0]

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()
crawler/utils.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import random
5
+ import time
6
+ import urllib.parse
7
+ from datetime import datetime, timezone
8
+ from typing import Iterable
9
+
10
+
11
def canonicalize_url(url: str) -> str:
    """Normalize a URL: drop the fragment and any `utm_*` tracking parameters.

    Trailing slashes on non-root paths are preserved as-is (stripping them can
    cause 404s on detail pages); a bare root path ("/") is removed so that
    "https://host/" and "https://host" canonicalize identically.

    Fix: removed a dead no-op branch (`path = path`) and corrected the old
    docstring, which claimed trailing slashes were stripped when they never were.

    Returns the normalized URL, or the input unchanged if normalization yields
    an empty string.
    """
    parsed = urllib.parse.urlparse(url)
    query = urllib.parse.parse_qsl(parsed.query, keep_blank_values=True)
    # Drop marketing/tracking parameters; keep everything else, in order.
    filtered_query = [(k, v) for k, v in query if not k.lower().startswith("utm_")]
    cleaned_query = urllib.parse.urlencode(filtered_query, doseq=True)
    # Collapse a bare "/" root path; leave all other paths untouched.
    path = parsed.path if parsed.path != "/" else ""
    normalized = parsed._replace(query=cleaned_query, fragment="", path=path).geturl()
    return normalized or url
23
+
24
+
25
def make_assessment_id(url: str) -> str:
    """Stable identifier: SHA-1 hex digest of the canonical form of `url`."""
    canonical_bytes = canonicalize_url(url).encode("utf-8")
    return hashlib.sha1(canonical_bytes).hexdigest()
29
+
30
+
31
def now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    return datetime.now(tz=timezone.utc).isoformat()
33
+
34
+
35
class RateLimiter:
    """Coarse politeness throttle: enforce a minimum, jittered gap between calls."""

    def __init__(self, base_delay: float, jitter: float) -> None:
        self.base_delay = base_delay  # minimum seconds between calls
        self.jitter = jitter  # extra random delay drawn from [0, jitter]
        self._last_ts = 0.0  # monotonic timestamp of the previous call

    def sleep(self) -> None:
        """Block until base_delay plus a random jitter has elapsed since the last call."""
        elapsed = time.monotonic() - self._last_ts
        required = self.base_delay + random.uniform(0, self.jitter)
        remaining = required - elapsed
        if remaining > 0:
            time.sleep(remaining)
        self._last_ts = time.monotonic()
50
+
51
+
52
def batched(iterable: Iterable, size: int):
    """Yield lists of up to `size` consecutive items; the final batch may be short."""
    current: list = []
    for element in iterable:
        current.append(element)
        if len(current) == size:
            yield current
            current = []
    # Flush whatever remains after the loop.
    if current:
        yield current
docker-compose.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Two-service stack: backend API container plus the Next.js frontend.
version: "3.9"

services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      LLM_MODEL: Qwen/Qwen2.5-1.5B-Instruct
      HF_HOME: /cache/hf  # Hugging Face cache lives on the named volume below
    volumes:
      - ./data:/app/data:ro  # catalog/embedding artifacts, read-only in the container
      - ./models:/app/models:ro  # local model artifacts, read-only
      - hf-cache:/cache/hf  # persistent model-download cache

  web:
    build: ./frontend
    ports:
      - "3000:3000"
    environment:
      # API base URL; the `api` service name resolves on the compose network
      NEXT_PUBLIC_API_BASE: http://api:8000
    depends_on:
      - api

volumes:
  hf-cache:
embeddings/generator.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from pathlib import Path
6
+ from typing import List, Tuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from tqdm import tqdm
11
+
12
+ from data.catalog_loader import make_assessment_id
13
+ from models.embedding_model import EmbeddingModel
14
+
15
+
16
def generate_embeddings(catalog_path: str, model_name: str, batch_size: int = 32, output_dir: str = "data/embeddings") -> Tuple[np.ndarray, List[str]]:
    """Embed every catalog document and persist the artifacts.

    Reads the enriched catalog (JSONL or Parquet), embeds the ``doc_text``
    column in batches, and writes three files under ``output_dir``:
    ``embeddings.npy`` (float32, normalized by the encoder), ``assessment_ids.json``
    (ids aligned row-for-row with the embedding matrix), and
    ``generation_log.json`` (provenance/timing metadata).

    Args:
        catalog_path: Path to the catalog file; ``.jsonl`` is read as
            JSON-lines, anything else via ``pd.read_parquet``.
        model_name: Identifier passed to ``EmbeddingModel``.
        batch_size: Number of documents encoded per model call.
        output_dir: Directory for output artifacts (created if missing).

    Returns:
        Tuple of (embedding matrix, assessment ids in matrix row order).

    Raises:
        KeyError: If ``assessment_id`` is absent and cannot be derived from
            ``url``, or if ``doc_text`` is missing.
        ValueError: If the catalog contains no rows (``np.vstack`` would
            otherwise fail later with an opaque error).
    """
    df = pd.read_json(catalog_path, lines=True) if catalog_path.endswith(".jsonl") else pd.read_parquet(catalog_path)
    if "assessment_id" not in df.columns:
        if "url" in df.columns:
            df["assessment_id"] = df["url"].apply(make_assessment_id)
        else:
            raise KeyError("assessment_id not found and url missing to derive it.")
    if "doc_text" not in df.columns:
        raise KeyError("doc_text column missing; run catalog enrichment first.")
    # Sort so the row order (and therefore the saved matrix) is deterministic.
    df = df.sort_values("assessment_id")
    texts = df["doc_text"].tolist()
    ids = df["assessment_id"].tolist()
    if not texts:
        raise ValueError(f"No documents found in {catalog_path}; nothing to embed.")

    model = EmbeddingModel(model_name)
    embeddings: List[np.ndarray] = []
    start = time.time()
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i : i + batch_size]
        embeds = model.encode(batch, normalize=True, batch_size=batch_size, is_query=False)
        embeddings.append(embeds)
    embeddings_arr = np.vstack(embeddings).astype(np.float32)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    np.save(Path(output_dir) / "embeddings.npy", embeddings_arr)
    with open(Path(output_dir) / "assessment_ids.json", "w") as f:
        json.dump(ids, f, indent=2)

    total_time = time.time() - start
    log = {
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "model_name": model_name,
        "num_documents": len(texts),
        "embedding_dim": embeddings_arr.shape[1],
        "batch_size": batch_size,
        "total_time_seconds": total_time,
        "avg_time_per_doc_ms": (total_time / len(texts) * 1000) if len(texts) else None,
        "normalized": True,
        "catalog_path": catalog_path,
    }
    with open(Path(output_dir) / "generation_log.json", "w") as f:
        json.dump(log, f, indent=2)
    return embeddings_arr, ids
56
+
57
+
58
if __name__ == "__main__":
    # CLI entry point: embed a catalog file and write artifacts to disk.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--catalog", required=True, help="Enriched catalog with doc_text")
    cli.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    cli.add_argument("--batch-size", type=int, default=32)
    cli.add_argument("--output-dir", default="data/embeddings")
    opts = cli.parse_args()

    generate_embeddings(opts.catalog, opts.model, batch_size=opts.batch_size, output_dir=opts.output_dir)
eval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Evaluation datasets, metrics, and experiments."""
eval/compare_runs.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
def load_metrics(path: str) -> dict:
    """Read a metrics JSON file and return its parsed contents."""
    with open(path, "r") as handle:
        return json.load(handle)
11
+
12
+
13
def compare(run_a: str, run_b: str) -> dict:
    """Load metrics.json from two run directories and pair their headline metrics."""

    def extract(m: dict) -> dict:
        # Only the headline retrieval metrics are surfaced for comparison.
        return {
            "train_r10": m["train"]["recall@10"],
            "val_r10": m["val"]["recall@10"],
            "val_mrr10": m["val"]["mrr@10"],
        }

    metrics_a = load_metrics(Path(run_a) / "metrics.json")
    metrics_b = load_metrics(Path(run_b) / "metrics.json")
    return {"run_a": run_a, "run_b": run_b, "metrics_a": extract(metrics_a), "metrics_b": extract(metrics_b)}
23
+
24
+
25
def main():
    """CLI: print a JSON comparison of two evaluation run directories."""
    if len(sys.argv) != 3:
        print("Usage: python -m eval.compare_runs <run_dir_a> <run_dir_b>")
        sys.exit(1)
    comparison = compare(sys.argv[1], sys.argv[2])
    print(json.dumps(comparison, indent=2))
32
+
33
+ if __name__ == "__main__":
34
+ main()
eval/diagnostic_topk.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ from data.catalog_loader import load_catalog
10
+ from data.train_loader import load_train
11
+ from recommenders.bm25 import BM25Recommender
12
+ from recommenders.vector_recommender import VectorRecommender
13
+ from recommenders.hybrid_rrf import HybridRRFRecommender, HybridRerankRecommender
14
+ from retrieval.vector_index import VectorIndex
15
+ from models.embedding_model import EmbeddingModel
16
+ from rerankers.cross_encoder import CrossEncoderReranker
17
+
18
+
19
def main():
    """Diagnostics CLI: measure how often gold positives survive retrieval.

    For every training query, retrieves top-N fused candidates (BM25 + vector
    via RRF) and a cross-encoder-reranked top-10, then reports how many
    queries lose all their relevant assessments at each stage. Writes
    summary.json and per_query.jsonl under --output-dir.
    """
    parser = argparse.ArgumentParser(description="Diagnostics: positives coverage in top-N candidates and top-10 rerank.")
    parser.add_argument("--catalog", default="data/catalog_docs.jsonl")
    parser.add_argument("--train", required=True)
    parser.add_argument("--vector-index", required=True)
    parser.add_argument("--assessment-ids", required=True)
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--reranker-model", default="cross-encoder/ms-marco-MiniLM-L-6-v2")
    parser.add_argument("--topn", type=int, default=200, help="Top-N candidates to inspect")
    parser.add_argument("--rrf-k", type=int, default=60)
    parser.add_argument("--output-dir", default="runs/diagnostic_topk")
    args = parser.parse_args()

    # Build the retrieval stack: dense retriever, BM25, RRF fusion, and a
    # cross-encoder-reranked variant over the same candidate pool.
    df_catalog, _, id_by_url = load_catalog(args.catalog)
    with open(args.assessment_ids) as f:
        ids = json.load(f)
    index = VectorIndex.load(args.vector_index)
    embed_model = EmbeddingModel(args.model)
    vector_rec = VectorRecommender(embed_model, index, df_catalog, ids, k_candidates=args.topn)
    bm25_rec = BM25Recommender(df_catalog)
    hybrid = HybridRRFRecommender(bm25_rec, vector_rec, topn_candidates=args.topn, rrf_k=args.rrf_k)
    reranker = CrossEncoderReranker(model_name=args.reranker_model)
    hybrid_rerank = HybridRerankRecommender(bm25_rec, vector_rec, reranker, df_catalog, topn_candidates=args.topn, rrf_k=args.rrf_k)

    examples, label_report = load_train(args.train, id_by_url)
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    Path(args.output_dir, "label_resolution_report.json").write_text(json.dumps(label_report, indent=2))

    rows = []
    zero_topn = 0
    zero_top10 = 0
    for ex in examples:
        candidates = hybrid.recommend(ex.query, k=args.topn)
        reranked = hybrid_rerank.recommend(ex.query, k=10)
        pos_topn = len(set(candidates).intersection(ex.relevant_ids))
        pos_top10 = len(set(reranked).intersection(ex.relevant_ids))
        # Fix: the original had two identical `if pos_topn == 0` branches
        # maintaining separate but always-equal counters (zero_topn and
        # coverage_fail); one counter suffices.
        if pos_topn == 0:
            zero_topn += 1
        if pos_top10 == 0:
            zero_top10 += 1
        rows.append(
            {
                "query": ex.query,
                "relevant_ids": list(ex.relevant_ids),
                "pos_in_topn": pos_topn,
                "pos_in_top10": pos_top10,
                "candidates": candidates,
                "reranked_top10": reranked,
            }
        )

    summary = {
        "total_queries": len(examples),
        "topn": args.topn,
        "zero_pos_in_topn": zero_topn,
        "zero_pos_in_top10": zero_top10,
        # Kept for backward compatibility with consumers of summary.json;
        # by construction always equal to zero_pos_in_topn.
        "coverage_failures": zero_topn,
        "label_match_pct": label_report.get("matched_pct"),
    }
    with open(Path(args.output_dir) / "summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    pd.DataFrame(rows).to_json(Path(args.output_dir) / "per_query.jsonl", orient="records", lines=True)
    print(json.dumps(summary, indent=2))
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
eval/metrics.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, List, Sequence, Set
4
+
5
+
6
def recall_at_k(ground_truth: Set[str], preds: Sequence[str], k: int) -> float:
    """Fraction of ground-truth ids that appear among the top-k predictions."""
    if not ground_truth:
        # No relevant items: score 0 by convention rather than dividing by zero.
        return 0.0
    retrieved = set(preds[:k])
    return len(ground_truth & retrieved) / len(ground_truth)
12
+
13
+
14
def mrr_at_k(ground_truth: Set[str], preds: Sequence[str], k: int) -> float:
    """Reciprocal rank of the first relevant prediction within the top k (0 if none)."""
    if not ground_truth:
        return 0.0
    reciprocal_ranks = (
        1.0 / rank
        for rank, pid in enumerate(preds[:k], start=1)
        if pid in ground_truth
    )
    return next(reciprocal_ranks, 0.0)
21
+
22
+
23
def mean_metric(queries: Iterable[Set[str]], preds_list: Iterable[Sequence[str]], fn, k: int) -> float:
    """Average ``fn(ground_truth, preds, k)`` over paired queries and predictions."""
    scores = [fn(gt, preds, k) for gt, preds in zip(queries, preds_list)]
    return sum(scores) / len(scores) if scores else 0.0
eval/run_eval.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Dict, List
9
+
10
+ import pandas as pd
11
+
12
+ from data.catalog_loader import load_catalog
13
+ from data.train_loader import load_train, save_label_resolution_report
14
+ from eval.metrics import recall_at_k, mrr_at_k
15
+ from recommenders.dummy_random import DummyRandomRecommender
16
+ from recommenders.bm25 import BM25Recommender
17
+ from recommenders.vector_recommender import VectorRecommender
18
+ from recommenders.hybrid_rrf import HybridRRFRecommender, HybridRerankRecommender
19
+ from recommenders.hybrid_rrf_lgbm import HybridRRFLGBMRecommender
20
+ from retrieval.vector_index import VectorIndex
21
+ from models.embedding_model import EmbeddingModel
22
+ from rerankers.cross_encoder import CrossEncoderReranker
23
+ from rerankers.lgbm_reranker import LGBMReranker
24
+ from retrieval.query_rewriter import rewrite_query
25
+
26
+
27
def split_examples(examples, val_ratio=0.2, seed=42):
    """Deterministically shuffle and split examples into (train, val) lists."""
    import random

    shuffled = list(examples)
    random.Random(seed).shuffle(shuffled)
    n_train = int(len(shuffled) * (1 - val_ratio))
    return shuffled[:n_train], shuffled[n_train:]
35
+
36
+
37
def run_eval(catalog_path: str, train_path: str, recommender_name: str, out_dir: str, seed: int = 42):
    """Evaluate a simple (non-vector) recommender and write run artifacts.

    Loads the catalog and labeled queries, splits them 80/20, scores the
    chosen recommender with recall@5/10 and MRR@10 on each split, and writes
    metrics.json, per_query_results.jsonl, and worst_queries.csv into
    ``out_dir``. Vector/hybrid recommenders are constructed in ``main`` and
    do not go through this helper.

    Raises:
        ValueError: For an unknown ``recommender_name``.
        RuntimeError: If "vector" is requested here instead of via ``main``.
    """
    # Fix: create out_dir up front — the label resolution report was
    # previously written before the directory was created (mkdir only
    # happened near the end), which could fail on a fresh run directory.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    df_catalog, catalog_by_id, id_by_url = load_catalog(catalog_path)
    examples, label_report = load_train(train_path, id_by_url)
    save_label_resolution_report(label_report, Path(out_dir) / "label_resolution_report.json")

    train_split, val_split = split_examples(examples, val_ratio=0.2, seed=seed)

    def make_recommender():
        # Only recommenders that need no external index/model are built here.
        if recommender_name == "dummy_random":
            return DummyRandomRecommender(df_catalog["assessment_id"].tolist(), seed=seed)
        if recommender_name == "bm25":
            return BM25Recommender(df_catalog)
        if recommender_name == "vector":
            # Expect doc_text present in df_catalog and provided index/ids/model via env/args; set below in main()
            raise RuntimeError("Vector recommender should be constructed in main with index and ids.")
        raise ValueError(f"Unknown recommender: {recommender_name}")

    recommender = make_recommender()

    def eval_split(split, split_name):
        # Score one split; returns (recall@10, recall@5, mrr@10, per-query rows).
        preds_list: List[List[str]] = []
        gt_list: List[set] = []
        rows = []
        for ex in split:
            preds_raw = recommender.recommend(ex.query, k=10)
            # Recommenders may return plain ids or dicts carrying assessment_id.
            preds = []
            for pr in preds_raw:
                if isinstance(pr, str):
                    preds.append(pr)
                elif isinstance(pr, dict) and "assessment_id" in pr:
                    preds.append(pr["assessment_id"])
            preds = preds[:10]
            preds_list.append(preds)
            gt_list.append(ex.relevant_ids)
            hits = len(set(preds).intersection(ex.relevant_ids))
            rows.append(
                {
                    "query": ex.query,
                    "relevant_ids": list(ex.relevant_ids),
                    "predicted_ids": preds,
                    "hits": hits,
                }
            )
        recall10 = sum(recall_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        recall5 = sum(recall_at_k(g, p, 5) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        mrr10 = sum(mrr_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        return recall10, recall5, mrr10, rows

    train_r10, train_r5, train_mrr10, train_rows = eval_split(train_split, "train")
    val_r10, val_r5, val_mrr10, val_rows = eval_split(val_split, "val")

    metrics = {
        "recommender": recommender_name,
        "label_match_pct": label_report.get("matched_pct"),
        "train": {"recall@10": train_r10, "recall@5": train_r5, "mrr@10": train_mrr10, "n": len(train_split)},
        "val": {"recall@10": val_r10, "recall@5": val_r5, "mrr@10": val_mrr10, "n": len(val_split)},
    }
    with open(Path(out_dir) / "metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)
    pd.DataFrame(train_rows + val_rows).to_json(Path(out_dir) / "per_query_results.jsonl", orient="records", lines=True)
    worst = sorted(val_rows, key=lambda r: r["hits"])[:10]
    pd.DataFrame(worst).to_csv(Path(out_dir) / "worst_queries.csv", index=False)
100
+
101
+
102
def main():
    # CLI entry point. Simple recommenders (dummy_random / bm25) delegate to
    # run_eval; embedding-backed variants (vector / hybrid_rrf /
    # hybrid_rrf_rerank / hybrid_rrf_lgbm) are constructed inline here because
    # they need the FAISS index, aligned id list, and embedding model.
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", default="data/catalog.jsonl")
    parser.add_argument("--train", required=True)
    parser.add_argument("--recommender", default="dummy_random")
    parser.add_argument("--out-dir", default=None)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--vector-index", type=str, help="Path to FAISS index (for recommender=vector/hybrid_rrf)")
    parser.add_argument("--assessment-ids", type=str, help="Path to assessment_ids.json aligned with embeddings/index")
    parser.add_argument("--model", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Embedding model for vector recommender")
    parser.add_argument("--topn-candidates", type=int, default=200, help="Top-N candidates to retrieve before fusion/rerank")
    parser.add_argument("--rrf-k", type=int, default=60, help="RRF smoothing constant")
    parser.add_argument("--reranker-model", type=str, default="cross-encoder/ms-marco-MiniLM-L-6-v2", help="Cross-encoder model for reranking")
    parser.add_argument("--lgbm-model", type=str, help="Path to trained LGBM model (for hybrid_rrf_lgbm)")
    parser.add_argument("--lgbm-features", type=str, help="Path to feature_schema.json for LGBM reranker")
    parser.add_argument("--use-rewriter", action="store_true", help="Rewrite queries before retrieval/rerank.")
    parser.add_argument("--vocab", type=str, help="Optional vocab JSON for rewriter boosts.")
    args = parser.parse_args()

    # Timestamped default run directory so repeated runs do not collide.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
    # consider datetime.now(timezone.utc) when the baseline allows it.
    run_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_dir = args.out_dir or f"runs/{run_id}_{args.recommender}"
    if args.recommender in {"vector", "hybrid_rrf", "hybrid_rrf_rerank", "hybrid_rrf_lgbm"}:
        # All embedding-backed recommenders need the FAISS index plus the id
        # list mapping index rows back to assessment ids.
        if not args.vector_index or not args.assessment_ids:
            raise ValueError("Vector/hybrid recommender requires --vector-index and --assessment-ids")
        df_catalog, _, id_by_url = load_catalog(args.catalog)
        with open(args.assessment_ids) as f:
            ids = json.load(f)
        index = VectorIndex.load(args.vector_index)
        embed_model = EmbeddingModel(args.model)
        examples, label_report = load_train(args.train, id_by_url)
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        save_label_resolution_report(label_report, Path(out_dir) / "label_resolution_report.json")
        vocab = {}
        if args.use_rewriter and args.vocab:
            with open(args.vocab) as f:
                vocab = json.load(f)

        train_split, val_split = split_examples(examples, val_ratio=0.2, seed=args.seed)
        vector_rec = VectorRecommender(embed_model, index, df_catalog, ids, k_candidates=args.topn_candidates)
        if args.recommender == "vector":
            recommender = vector_rec
        elif args.recommender == "hybrid_rrf":
            # BM25 + dense retrieval fused with reciprocal rank fusion.
            bm25_rec = BM25Recommender(df_catalog)
            recommender = HybridRRFRecommender(bm25_rec, vector_rec, topn_candidates=args.topn_candidates, rrf_k=args.rrf_k)
        elif args.recommender == "hybrid_rrf_rerank":
            # RRF fusion followed by a cross-encoder reranking pass.
            bm25_rec = BM25Recommender(df_catalog)
            reranker = CrossEncoderReranker(model_name=args.reranker_model)
            recommender = HybridRerankRecommender(
                bm25_rec,
                vector_rec,
                reranker,
                df_catalog,
                topn_candidates=args.topn_candidates,
                rrf_k=args.rrf_k,
            )
        else:
            # hybrid_rrf_lgbm: RRF candidates rescored by a trained LightGBM model.
            if not args.lgbm_model or not args.lgbm_features:
                raise ValueError("hybrid_rrf_lgbm requires --lgbm-model and --lgbm-features")
            bm25_rec = BM25Recommender(df_catalog)
            feature_cols = json.load(open(args.lgbm_features))
            # The schema file may be either a bare list or {"features": [...]}.
            if isinstance(feature_cols, dict) and "features" in feature_cols:
                feature_cols = feature_cols["features"]
            recommender = HybridRRFLGBMRecommender(
                bm25_rec,
                vector_rec,
                lgbm_model_path=args.lgbm_model,
                feature_cols=feature_cols,
                catalog_df=df_catalog,
                topn_candidates=args.topn_candidates,
                rrf_k=args.rrf_k,
            )

        def eval_split(split, split_name):
            # Mirrors run_eval's inner loop, with optional query rewriting and
            # a separate rerank query for the cross-encoder variant.
            preds_list = []
            gt_list = []
            rows = []
            for ex in split:
                retrieval_query = ex.query
                rerank_query = ex.query
                if args.use_rewriter:
                    rw = rewrite_query(ex.query, catalog_vocab=vocab)
                    retrieval_query = rw.retrieval_query
                    rerank_query = rw.rerank_query
                if args.recommender == "hybrid_rrf_rerank":
                    preds_raw = recommender.recommend(retrieval_query, k=10, rerank_query=rerank_query)
                else:
                    preds_raw = recommender.recommend(retrieval_query, k=10)
                # Recommenders may return plain ids or dicts carrying assessment_id.
                preds = []
                for pr in preds_raw:
                    if isinstance(pr, str):
                        preds.append(pr)
                    elif isinstance(pr, dict) and "assessment_id" in pr:
                        preds.append(pr["assessment_id"])
                preds = preds[:10]
                preds_list.append(preds)
                gt_list.append(ex.relevant_ids)
                hits = len(set(preds).intersection(ex.relevant_ids))
                rows.append(
                    {
                        "query": ex.query,
                        "relevant_ids": list(ex.relevant_ids),
                        "predicted_ids": preds,
                        "hits": hits,
                    }
                )
            recall10 = sum(recall_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            recall5 = sum(recall_at_k(g, p, 5) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            mrr10 = sum(mrr_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            return recall10, recall5, mrr10, rows

        train_r10, train_r5, train_mrr10, train_rows = eval_split(train_split, "train")
        val_r10, val_r5, val_mrr10, val_rows = eval_split(val_split, "val")
        metrics = {
            "recommender": args.recommender,
            "label_match_pct": label_report.get("matched_pct"),
            "train": {"recall@10": train_r10, "recall@5": train_r5, "mrr@10": train_mrr10, "n": len(train_split)},
            "val": {"recall@10": val_r10, "recall@5": val_r5, "mrr@10": val_mrr10, "n": len(val_split)},
            "config": {
                "topn_candidates": args.topn_candidates,
                "rrf_k": args.rrf_k,
                "model": args.model,
                "index": args.vector_index,
            },
        }
        with open(Path(out_dir) / "metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)
        pd.DataFrame(train_rows + val_rows).to_json(Path(out_dir) / "per_query_results.jsonl", orient="records", lines=True)
        worst = sorted(val_rows, key=lambda r: r["hits"])[:10]
        pd.DataFrame(worst).to_csv(Path(out_dir) / "worst_queries.csv", index=False)
        print(f"Run saved to {out_dir}")
    else:
        # Simple recommenders (dummy_random / bm25) go through run_eval.
        run_eval(args.catalog, args.train, args.recommender, out_dir, seed=args.seed)
        print(f"Run saved to {out_dir}")
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
frontend/.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ .env.*
3
+ .next
4
+ node_modules
5
+ npm-debug.log*
6
+ yarn-debug.log*
7
+ yarn-error.log*
8
+ .turbo
9
+ .vercel
frontend/Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Multi-stage build for the Next.js frontend.

# Stage 1: install the full dependency tree (dev deps included — needed to build).
FROM node:20-alpine AS deps
WORKDIR /app
COPY package*.json ./
RUN npm ci

# Stage 2: compile the Next.js app.
FROM deps AS builder
WORKDIR /app
COPY . .
RUN npm run build

# Stage 3: slim runtime image with production deps only.
FROM node:20-alpine AS runner
WORKDIR /app
ENV NODE_ENV=production
# NOTE(review): next.config.mjs sets `output: "export"`, which writes a static
# site into ./out — recent Next.js versions do not support `next start` with
# static export. Confirm whether this image should serve ./out with a static
# file server instead of `npm start`.
COPY --from=builder /app/.next ./.next
COPY --from=builder /app/public ./public
COPY --from=builder /app/package*.json ./
RUN npm ci --omit=dev
EXPOSE 3000
CMD ["npm", "start"]
frontend/index.html ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<!-- Minimal hand-testing UI for the recommender API: point at an API base
     URL, submit a query (plus an optional clarification answer), and inspect
     the raw JSON response. -->
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Assessment Recommender</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 24px; background: #f7f7f7; }
    .container { max-width: 960px; margin: 0 auto; background: #fff; padding: 20px; border-radius: 8px; box-shadow: 0 2px 6px rgba(0,0,0,0.1); }
    textarea, input { width: 100%; }
    label { display: block; margin: 8px 0 4px; font-weight: 600; }
    button { margin-top: 12px; padding: 10px 16px; cursor: pointer; }
    pre { background: #111; color: #0f0; padding: 12px; border-radius: 4px; overflow: auto; max-height: 420px; }
    .row { display: flex; gap: 8px; align-items: center; }
    .row input[type="checkbox"] { width: auto; }
  </style>
</head>
<body>
  <div class="container">
    <h2>Assessment Recommender</h2>
    <label for="api">API base URL</label>
    <input id="api" type="text" placeholder="http://localhost:8000" />

    <label for="query">Query</label>
    <textarea id="query" rows="4" placeholder="Enter your query..."></textarea>

    <label for="clarification">Clarification (optional)</label>
    <input id="clarification" type="text" placeholder="If a clarification question was asked, answer here" />

    <div class="row">
      <input id="verbose" type="checkbox" />
      <label for="verbose" style="margin: 0; font-weight: 400;">Verbose (debug)</label>
    </div>

    <button id="submit">Submit</button>

    <h3>Response</h3>
    <!-- Raw JSON response is dumped here by main.js. -->
    <pre id="output">Awaiting input...</pre>
  </div>

  <!-- NOTE(review): loaded from the absolute path /static/main.js, so this
       page presumably must be served by the backend rather than opened from
       disk — confirm the static mount path. -->
  <script type="module" src="/static/main.js"></script>
</body>
</html>
frontend/next-env.d.ts ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ /// <reference types="next" />
2
+ /// <reference types="next/image-types/global" />
3
+
4
+ // NOTE: This file should not be edited
5
+ // see https://nextjs.org/docs/basic-features/typescript for more information.
frontend/next.config.mjs ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
/** @type {import('next').NextConfig} */
const nextConfig = {
  // Emit a fully static build (`next build` writes ./out) so the app can be
  // hosted on static platforms (Render static site, etc.).
  output: "export",
  reactStrictMode: false,
};

export default nextConfig;
frontend/out/404.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/_next/static/css/e2d6b4bec72e8797.css" data-precedence="next"/><link rel="preload" as="script" fetchPriority="low" href="/_next/static/chunks/webpack-879f858537244e02.js"/><script src="/_next/static/chunks/fd9d1056-0eb575322ff5015c.js" async=""></script><script src="/_next/static/chunks/23-02b97631d99e6f05.js" async=""></script><script src="/_next/static/chunks/main-app-df951a18dbec0e17.js" async=""></script><title>404: This page could not be found.</title><title>SHL Assessment Recommender</title><meta name="description" content="Chat + recommendations UI powered by FastAPI backend"/><script src="/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js" noModule=""></script></head><body class="bg-slate-100"><div style="font-family:system-ui,&quot;Segoe UI&quot;,Roboto,Helvetica,Arial,sans-serif,&quot;Apple Color Emoji&quot;,&quot;Segoe UI Emoji&quot;;height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding:0 23px 0 0;font-size:24px;font-weight:500;vertical-align:top;line-height:49px">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:49px;margin:0">This page could not be found.</h2></div></div></div><script src="/_next/static/chunks/webpack-879f858537244e02.js" 
async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/_next/static/css/e2d6b4bec72e8797.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"2:I[5751,[],\"\"]\n4:I[9275,[],\"\"]\n5:I[1343,[],\"\"]\nb:I[6130,[],\"\"]\n6:{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"}\n7:{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"}\n8:{\"display\":\"inline-block\"}\n9:{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0}\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/_next/static/css/e2d6b4bec72e8797.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"$L2\",null,{\"buildId\":\"tvK2sxvsuv7CccL1KsVpv\",\"assetPrefix\":\"\",\"initialCanonicalUrl\":\"/_not-found\",\"initialTree\":[\"\",{\"children\":[\"/_not-found\",{\"children\":[\"__PAGE__\",{}]}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"/_not-found\",{\"children\":[\"__PAGE__\",{},[[\"$L3\",[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]]],null],null]},[\"$\",\"$L4\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\",\"/_not-found\",\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L5\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":\"$undefined\",\"notFoundStyles\":\"$undefined\",\"styles\":null}],null]},[[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"bg-slate-100\",\"children\":[\"$\",\"$L4\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L5\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be 
found.\"}],[\"$\",\"div\",null,{\"style\":\"$6\",\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":\"$7\",\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":\"$8\",\"children\":[\"$\",\"h2\",null,{\"style\":\"$9\",\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null],null],\"couldBeIntercepted\":false,\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"SHL Assessment Recommender\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"Chat + recommendations UI powered by FastAPI backend\"}]]\n3:null\n"])</script></body></html>
frontend/out/_next/static/chunks/23-02b97631d99e6f05.js ADDED
The diff for this file is too large to render. See raw diff
 
frontend/out/_next/static/chunks/app/_not-found/page-a99a188ec9244b3f.js ADDED
@@ -0,0 +1 @@
 
 
1
+ (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[409],{7589:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found/page",function(){return n(5457)}])},5457:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return s}}),n(9920);let i=n(7437);n(2265);let o={fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},l={display:"inline-block"},r={display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},d={fontSize:14,fontWeight:400,lineHeight:"49px",margin:0};function s(){return(0,i.jsxs)(i.Fragment,{children:[(0,i.jsx)("title",{children:"404: This page could not be found."}),(0,i.jsx)("div",{style:o,children:(0,i.jsxs)("div",{children:[(0,i.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,i.jsx)("h1",{className:"next-error-h1",style:r,children:"404"}),(0,i.jsx)("div",{style:l,children:(0,i.jsx)("h2",{style:d,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,23,744],function(){return e(e.s=7589)}),_N_E=e.O()}]);
frontend/out/_next/static/chunks/app/layout-fc95adeb217fd9c8.js ADDED
@@ -0,0 +1 @@
 
 
1
+ (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{3711:function(n,e,u){Promise.resolve().then(u.t.bind(u,3054,23))},3054:function(){}},function(n){n.O(0,[141,971,23,744],function(){return n(n.s=3711)}),_N_E=n.O()}]);
frontend/out/_next/static/chunks/app/page-73ea6ec0ec8fa438.js ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[931],{5531:function(e,t,s){Promise.resolve().then(s.bind(s,9306))},9306:function(e,t,s){"use strict";s.r(t),s.d(t,{default:function(){return M}});var a=s(7437),l=s(2265);/**
2
+ * @license lucide-react v0.561.0 - ISC
3
+ *
4
+ * This source code is licensed under the ISC license.
5
+ * See the LICENSE file in the root directory of this source tree.
6
+ */let r=e=>e.replace(/([a-z0-9])([A-Z])/g,"$1-$2").toLowerCase(),n=e=>e.replace(/^([A-Z])|[\s-_]+(\w)/g,(e,t,s)=>s?s.toUpperCase():t.toLowerCase()),i=e=>{let t=n(e);return t.charAt(0).toUpperCase()+t.slice(1)},o=function(){for(var e=arguments.length,t=Array(e),s=0;s<e;s++)t[s]=arguments[s];return t.filter((e,t,s)=>!!e&&""!==e.trim()&&s.indexOf(e)===t).join(" ").trim()},d=e=>{for(let t in e)if(t.startsWith("aria-")||"role"===t||"title"===t)return!0};/**
7
+ * @license lucide-react v0.561.0 - ISC
8
+ *
9
+ * This source code is licensed under the ISC license.
10
+ * See the LICENSE file in the root directory of this source tree.
11
+ */var c={xmlns:"http://www.w3.org/2000/svg",width:24,height:24,viewBox:"0 0 24 24",fill:"none",stroke:"currentColor",strokeWidth:2,strokeLinecap:"round",strokeLinejoin:"round"};/**
12
+ * @license lucide-react v0.561.0 - ISC
13
+ *
14
+ * This source code is licensed under the ISC license.
15
+ * See the LICENSE file in the root directory of this source tree.
16
+ */let u=(0,l.forwardRef)((e,t)=>{let{color:s="currentColor",size:a=24,strokeWidth:r=2,absoluteStrokeWidth:n,className:i="",children:u,iconNode:m,...x}=e;return(0,l.createElement)("svg",{ref:t,...c,width:a,height:a,stroke:s,strokeWidth:n?24*Number(r)/Number(a):r,className:o("lucide",i),...!u&&!d(x)&&{"aria-hidden":"true"},...x},[...m.map(e=>{let[t,s]=e;return(0,l.createElement)(t,s)}),...Array.isArray(u)?u:[u]])}),m=(e,t)=>{let s=(0,l.forwardRef)((s,a)=>{let{className:n,...d}=s;return(0,l.createElement)(u,{ref:a,iconNode:t,className:o("lucide-".concat(r(i(e))),"lucide-".concat(e),n),...d})});return s.displayName=i(e),s},x=m("refresh-cw",[["path",{d:"M3 12a9 9 0 0 1 9-9 9.75 9.75 0 0 1 6.74 2.74L21 8",key:"v9h5vc"}],["path",{d:"M21 3v5h-5",key:"1q7to0"}],["path",{d:"M21 12a9 9 0 0 1-9 9 9.75 9.75 0 0 1-6.74-2.74L3 16",key:"3uifl3"}],["path",{d:"M8 16H3v5",key:"1cv678"}]]),h=m("send",[["path",{d:"M14.536 21.686a.5.5 0 0 0 .937-.024l6.5-19a.496.496 0 0 0-.635-.635l-19 6.5a.5.5 0 0 0-.024.937l7.93 3.18a2 2 0 0 1 1.112 1.11z",key:"1ffxy3"}],["path",{d:"m21.854 2.147-10.94 10.939",key:"12cjpa"}]]),p=m("bug",[["path",{d:"M12 20v-9",key:"1qisl0"}],["path",{d:"M14 7a4 4 0 0 1 4 4v3a6 6 0 0 1-12 0v-3a4 4 0 0 1 4-4z",key:"uouzyp"}],["path",{d:"M14.12 3.88 16 2",key:"qol33r"}],["path",{d:"M21 21a4 4 0 0 0-3.81-4",key:"1b0z45"}],["path",{d:"M21 5a4 4 0 0 1-3.55 3.97",key:"5cxbf6"}],["path",{d:"M22 13h-4",key:"1jl80f"}],["path",{d:"M3 21a4 4 0 0 1 3.81-4",key:"1fjd4g"}],["path",{d:"M3 5a4 4 0 0 0 3.55 3.97",key:"1d7oge"}],["path",{d:"M6 13H2",key:"82j7cp"}],["path",{d:"m8 2 1.88 1.88",key:"fmnt4t"}],["path",{d:"M9 7.13V6a3 3 0 1 1 6 0v1.13",key:"1vgav8"}]]),v=m("settings",[["path",{d:"M9.671 4.136a2.34 2.34 0 0 1 4.659 0 2.34 2.34 0 0 0 3.319 1.915 2.34 2.34 0 0 1 2.33 4.033 2.34 2.34 0 0 0 0 3.831 2.34 2.34 0 0 1-2.33 4.033 2.34 2.34 0 0 0-3.319 1.915 2.34 2.34 0 0 1-4.659 0 2.34 2.34 0 0 0-3.32-1.915 2.34 2.34 0 0 1-2.33-4.033 2.34 2.34 0 0 0 0-3.831A2.34 2.34 0 0 1 6.35 
6.051a2.34 2.34 0 0 0 3.319-1.915",key:"1i5ecw"}],["circle",{cx:"12",cy:"12",r:"3",key:"1v7zrd"}]]),f=m("funnel",[["path",{d:"M10 20a1 1 0 0 0 .553.895l2 1A1 1 0 0 0 14 21v-7a2 2 0 0 1 .517-1.341L21.74 4.67A1 1 0 0 0 21 3H3a1 1 0 0 0-.742 1.67l7.225 7.989A2 2 0 0 1 10 14z",key:"sc7q7i"}]]),g=m("search",[["path",{d:"m21 21-4.34-4.34",key:"14j7rj"}],["circle",{cx:"11",cy:"11",r:"8",key:"4ej97u"}]]),b=m("sliders-horizontal",[["path",{d:"M10 5H3",key:"1qgfaw"}],["path",{d:"M12 19H3",key:"yhmn1j"}],["path",{d:"M14 3v4",key:"1sua03"}],["path",{d:"M16 17v4",key:"1q0r14"}],["path",{d:"M21 12h-9",key:"1o4lsq"}],["path",{d:"M21 19h-5",key:"1rlt1p"}],["path",{d:"M21 5h-7",key:"1oszz2"}],["path",{d:"M8 10v4",key:"tgpxqk"}],["path",{d:"M8 12H3",key:"a7s4jb"}]]),j=m("link",[["path",{d:"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71",key:"1cjeqo"}],["path",{d:"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71",key:"19qd67"}]]);async function y(e,t,s){return w("".concat(e.replace(/\/$/,""),"/chat"),t,s)}async function N(e,t,s){return w("".concat(e.replace(/\/$/,""),"/recommend"),t,s)}async function w(e,t,s){let a=new AbortController,l=setTimeout(()=>a.abort(),3e4);try{let l=await fetch(e,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(t),signal:s||a.signal});if(!l.ok){let e=await l.text();throw Error("HTTP ".concat(l.status,": ").concat(e))}return l.json()}finally{clearTimeout(l)}}function k(e,t){let[s,a]=(0,l.useState)(t);return(0,l.useEffect)(()=>{try{let t=localStorage.getItem(e);null!==t&&a(JSON.parse(t))}catch(e){}},[]),(0,l.useEffect)(()=>{try{localStorage.setItem(e,JSON.stringify(s))}catch(e){}},[e,s]),[s,a]}let C=["Java dev + collaboration + 40 minutes","Sales graduate assessment for 60 minutes","Culture fit assessment for COO, 60 minutes"];function M(){var 
e,t,s;let[r,n]=k("api_base","http://localhost:8000"),[i,o]=k("mode","recommend"),[d,c]=k("verbose",!1),[u,m]=k("llm_model","Qwen/Qwen2.5-1.5B-Instruct"),[w,M]=(0,l.useState)(""),[S,A]=(0,l.useState)(""),[_,z]=(0,l.useState)([]),[L,q]=(0,l.useState)(!1),[E,O]=(0,l.useState)(null),[T,H]=(0,l.useState)({search:"",remote:"any",adaptive:"any",duration:"any",sort:"match"}),R=(0,l.useRef)(null);(0,l.useEffect)(()=>{_.length&&null===E&&O(_.length-1)},[_,E]);let D=null!==E?_[E]:null,I=(null==D?void 0:null===(e=D.response)||void 0===e?void 0:e.recommended_assessments)||(null==D?void 0:null===(t=D.response)||void 0===t?void 0:t.final_results)||[],J=null==D?void 0:null===(s=D.response)||void 0===s?void 0:s.debug,U=(0,l.useMemo)(()=>{let e=[...I],{search:t,remote:s,adaptive:a,duration:l,sort:r}=T;if(t.trim()){let s=t.toLowerCase();e=e.filter(e=>{var t,a,l;return(null===(t=e.name)||void 0===t?void 0:t.toLowerCase().includes(s))||(null===(a=e.description)||void 0===a?void 0:a.toLowerCase().includes(s))||(null===(l=e.test_type)||void 0===l?void 0:l.some(e=>e.toLowerCase().includes(s)))})}return"any"!==s&&(e=e.filter(e=>(e.remote_support||"").toLowerCase()===s.toLowerCase())),"any"!==a&&(e=e.filter(e=>(e.adaptive_support||"").toLowerCase()===a.toLowerCase())),"any"!==l&&(e=e.filter(e=>{let t=e.duration;return null==t?"unknown"===l:"<=20"===l?t<=20:"<=40"===l?t<=40:"<=60"!==l||t<=60})),"short"===r?e.sort((e,t)=>(e.duration||999)-(t.duration||999)):"adaptive"===r&&e.sort((e,t)=>("Yes"===t.adaptive_support?1:0)-("Yes"===e.adaptive_support?1:0)),e},[I,T]),P=async()=>{var e;if(!w.trim())return;q(!0),null===(e=R.current)||void 0===e||e.abort();let t=new AbortController;R.current=t;let s={query:w,verbose:d};S.trim()&&(s.clarification_answer=S.trim()),"recommend"===i&&u&&(s.llm_model=u);let a=crypto.randomUUID(),l=Date.now();z(e=>[...e,{id:a,query:w,response:null,ts:l}]);try{let e="chat"===i?await y(r,s,t.signal):await N(r,s,t.signal);z(t=>t.map(t=>t.id===a?{...t,response:e,error:void 
0}:t)),O(_.length),M(""),A("")}catch(e){z(t=>t.map(t=>t.id===a?{...t,error:e.message}:t))}finally{q(!1)}},Y=(0,a.jsxs)("div",{className:"flex items-center justify-between mb-3",children:[(0,a.jsxs)("div",{children:[(0,a.jsx)("h1",{className:"text-3xl font-semibold text-slate-900",children:"SHL Assessment Recommender"}),(0,a.jsx)("p",{className:"text-sm text-slate-600",children:"Chat to get top-10 assessments. Filters and debug on the right."})]}),(0,a.jsxs)("div",{className:"hidden md:flex items-center gap-2 text-xs text-slate-500",children:[(0,a.jsx)(x,{size:16})," Live against FastAPI backend"]})]}),B=(0,a.jsxs)("div",{className:"flex flex-wrap gap-3 text-sm",children:[(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsx)("label",{className:"font-medium",children:"Mode"}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:i,onChange:e=>o(e.target.value),children:[(0,a.jsx)("option",{value:"recommend",children:"/recommend"}),(0,a.jsx)("option",{value:"chat",children:"/chat"})]})]}),(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsx)("label",{className:"font-medium",children:"LLM"}),(0,a.jsx)("input",{className:"border rounded px-2 py-1",value:u,onChange:e=>m(e.target.value),placeholder:"Qwen/Qwen2.5-1.5B-Instruct"})]}),(0,a.jsxs)("label",{className:"flex items-center gap-2",children:[(0,a.jsx)("input",{type:"checkbox",checked:d,onChange:e=>c(e.target.checked)}),"Verbose debug"]})]}),Q=(0,a.jsxs)("div",{className:"flex flex-col h-full",children:[(0,a.jsxs)("div",{className:"flex flex-col gap-3 flex-1 overflow-hidden bg-white border rounded-xl shadow-sm p-4",children:[(0,a.jsxs)("div",{className:"flex items-center justify-between",children:[(0,a.jsxs)("div",{className:"text-lg font-semibold flex items-center gap-2",children:[(0,a.jsx)(h,{size:18})," Chat"]}),(0,a.jsx)("button",{onClick:()=>{M(C[0])},className:"text-xs text-blue-600 hover:underline",children:"Use sample"})]}),(0,a.jsxs)("div",{className:"flex 
gap-2 items-center text-sm",children:[(0,a.jsx)("label",{className:"font-medium min-w-[70px]",children:"API base"}),(0,a.jsx)("input",{className:"border rounded px-2 py-1 w-full",value:r,onChange:e=>n(e.target.value)})]}),(0,a.jsx)("textarea",{className:"border rounded-lg p-3 w-full text-sm min-h-[140px] resize-none focus:ring-2 focus:ring-blue-200",placeholder:"Enter job description or query",value:w,onChange:e=>M(e.target.value),onKeyDown:e=>{"Enter"!==e.key||e.shiftKey||(e.preventDefault(),P())}}),(0,a.jsx)("div",{className:"flex gap-2",children:C.map(e=>(0,a.jsx)("button",{onClick:()=>M(e),className:"text-xs bg-slate-100 hover:bg-slate-200 px-2 py-1 rounded",children:e},e))}),(0,a.jsxs)("div",{className:"flex gap-3 items-center",children:[(0,a.jsx)("input",{className:"border rounded px-2 py-1 text-sm flex-1",placeholder:"Clarification (if asked)",value:S,onChange:e=>A(e.target.value)}),(0,a.jsxs)("button",{onClick:P,disabled:L,className:"bg-blue-600 text-white px-4 py-2 rounded-lg flex items-center gap-2 hover:bg-blue-700 disabled:opacity-60",children:[(0,a.jsx)(h,{size:16})," ",L?"Sending...":"Send"]}),(0,a.jsx)("button",{onClick:()=>c(!d),className:"p-2 border rounded-lg hover:bg-slate-100",title:"Toggle verbose debug",children:(0,a.jsx)(p,{size:16})}),(0,a.jsx)("button",{onClick:()=>o("recommend"===i?"chat":"recommend"),className:"p-2 border rounded-lg hover:bg-slate-100",title:"Toggle endpoint",children:(0,a.jsx)(v,{size:16})})]}),B]}),(0,a.jsxs)("div",{className:"mt-3 bg-white border rounded-xl shadow-sm p-3 text-sm text-slate-600 max-h-48 overflow-auto",children:[(0,a.jsx)("div",{className:"font-semibold mb-2",children:"History"}),0===_.length&&(0,a.jsx)("div",{className:"text-slate-400",children:"No queries yet."}),_.map((e,t)=>(0,a.jsxs)("button",{onClick:()=>O(t),className:"block w-full text-left px-2 py-1 rounded ".concat(t===E?"bg-blue-50 text-blue-700":"hover:bg-slate-100"),children:[(0,a.jsx)("div",{className:"font-medium text-sm 
truncate",children:e.query}),(0,a.jsx)("div",{className:"text-xs text-slate-500",children:new Date(e.ts).toLocaleTimeString()}),e.error&&(0,a.jsxs)("div",{className:"text-xs text-red-600",children:["Error: ",e.error]})]},e.id))]})]}),$=(0,a.jsxs)("div",{className:"flex flex-col h-full",children:[(0,a.jsxs)("div",{className:"bg-white border rounded-xl shadow-sm p-4 flex flex-col gap-3",children:[(0,a.jsxs)("div",{className:"flex items-center justify-between",children:[(0,a.jsxs)("div",{className:"text-lg font-semibold flex items-center gap-2",children:[(0,a.jsx)(f,{size:18})," Results"]}),(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsxs)("div",{className:"relative",children:[(0,a.jsx)(g,{className:"absolute left-2 top-2.5 h-4 w-4 text-slate-400"}),(0,a.jsx)("input",{className:"pl-8 pr-3 py-2 border rounded-lg text-sm",placeholder:"Search results",value:T.search,onChange:e=>H(t=>({...t,search:e.target.value}))})]}),(0,a.jsx)(b,{size:16,className:"text-slate-500"})]})]}),(0,a.jsxs)("div",{className:"flex flex-wrap gap-3 text-xs",children:[(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.remote,onChange:e=>H(t=>({...t,remote:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Remote: Any"}),(0,a.jsx)("option",{value:"Yes",children:"Remote: Yes"}),(0,a.jsx)("option",{value:"No",children:"Remote: No"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.adaptive,onChange:e=>H(t=>({...t,adaptive:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Adaptive: Any"}),(0,a.jsx)("option",{value:"Yes",children:"Adaptive: Yes"}),(0,a.jsx)("option",{value:"No",children:"Adaptive: No"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.duration,onChange:e=>H(t=>({...t,duration:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Duration: Any"}),(0,a.jsx)("option",{value:"<=20",children:"≤ 20 min"}),(0,a.jsx)("option",{value:"<=40",children:"≤ 40 
min"}),(0,a.jsx)("option",{value:"<=60",children:"≤ 60 min"}),(0,a.jsx)("option",{value:"unknown",children:"Unknown only"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.sort,onChange:e=>H(t=>({...t,sort:e.target.value})),children:[(0,a.jsx)("option",{value:"match",children:"Sort: Best match"}),(0,a.jsx)("option",{value:"short",children:"Sort: Shortest"}),(0,a.jsx)("option",{value:"adaptive",children:"Sort: Adaptive first"})]})]}),(0,a.jsxs)("div",{className:"grid md:grid-cols-2 lg:grid-cols-2 gap-3",children:[0===U.length&&(0,a.jsx)("div",{className:"text-sm text-slate-500",children:"No results yet. Submit a query to see recommendations."}),U.map((e,t)=>{var s;return(0,a.jsxs)("div",{className:"border rounded-xl p-4 shadow-sm hover:shadow-md transition bg-slate-50",children:[(0,a.jsxs)("div",{className:"flex items-start justify-between gap-2",children:[(0,a.jsx)("a",{href:e.url,target:"_blank",rel:"noreferrer",className:"font-semibold text-slate-900 hover:text-blue-600",children:e.name||"Untitled"}),(0,a.jsx)("button",{className:"text-slate-500 hover:text-blue-600",onClick:()=>e.url&&navigator.clipboard.writeText(e.url),children:(0,a.jsx)(j,{size:16})})]}),(0,a.jsxs)("div",{className:"flex flex-wrap gap-2 mt-2",children:[null===(s=e.test_type)||void 0===s?void 0:s.map(e=>(0,a.jsx)("span",{className:"text-[11px] bg-blue-50 text-blue-700 px-2 py-1 rounded-full border border-blue-100",children:e},e)),(0,a.jsx)("span",{className:"text-[11px] bg-slate-100 text-slate-700 px-2 py-1 rounded-full border border-slate-200",children:e.duration?"".concat(e.duration," min"):"Duration unknown"}),(0,a.jsxs)("span",{className:"text-[11px] bg-emerald-50 text-emerald-700 px-2 py-1 rounded-full border border-emerald-100",children:["Remote: ",e.remote_support||"?"]}),(0,a.jsxs)("span",{className:"text-[11px] bg-indigo-50 text-indigo-700 px-2 py-1 rounded-full border border-indigo-100",children:["Adaptive: 
",e.adaptive_support||"?"]})]}),(0,a.jsx)("p",{className:"text-sm text-slate-700 mt-2 overflow-hidden text-ellipsis",children:e.description||"No description."})]},t)})]})]}),d&&J&&(0,a.jsxs)("div",{className:"mt-3 bg-white border rounded-xl shadow-sm p-4",children:[(0,a.jsxs)("div",{className:"flex items-center gap-2 text-sm font-semibold mb-2",children:[(0,a.jsx)(p,{size:16})," Debug"]}),(0,a.jsxs)("div",{className:"grid md:grid-cols-2 gap-3 text-xs",children:[(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Plan"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-48 text-slate-700",children:JSON.stringify(J.plan,null,2)})]}),J.fusion&&(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Fusion"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-48 text-slate-700",children:JSON.stringify(J.fusion,null,2)})]}),J.candidates&&(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2 col-span-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Top candidates"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-60 text-slate-700",children:JSON.stringify(J.candidates,null,2)})]})]})]})]});return(0,a.jsx)("main",{className:"min-h-screen bg-slate-100",children:(0,a.jsxs)("div",{className:"app-shell py-6",children:[Y,(0,a.jsxs)("div",{className:"grid lg:grid-cols-2 gap-6 mt-4",children:[Q,$]})]})})}}},function(e){e.O(0,[971,23,744],function(){return e(e.s=5531)}),_N_E=e.O()}]);
frontend/out/_next/static/chunks/fd9d1056-0eb575322ff5015c.js ADDED
The diff for this file is too large to render. See raw diff
 
frontend/out/_next/static/chunks/framework-aec844d2ccbe7592.js ADDED
The diff for this file is too large to render. See raw diff
 
frontend/out/_next/static/chunks/main-app-df951a18dbec0e17.js ADDED
@@ -0,0 +1 @@
 
 
1
+ (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{3332:function(e,n,t){Promise.resolve().then(t.t.bind(t,5751,23)),Promise.resolve().then(t.t.bind(t,6513,23)),Promise.resolve().then(t.t.bind(t,6130,23)),Promise.resolve().then(t.t.bind(t,9275,23)),Promise.resolve().then(t.t.bind(t,5324,23)),Promise.resolve().then(t.t.bind(t,1343,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,23],function(){return n(1028),n(3332)}),_N_E=e.O()}]);