Spaces:
Sleeping
Sleeping
github-actions commited on
Commit ·
5a3b322
1
Parent(s): c44ff26
Sync from GitHub 2025-12-17T12:18:53Z
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .DS_Store +0 -0
- .dockerignore +27 -0
- .env.example +16 -0
- .gitattributes +0 -35
- .github/workflows/push_to_hf_space.yml +38 -0
- .gitignore +20 -0
- Dockerfile +23 -0
- Makefile +30 -0
- README.md +149 -12
- agent/app.py +76 -0
- agent/chat_agent.py +156 -0
- agent/router_agent.py +86 -0
- agent/server.py +324 -0
- api/__init__.py +1 -0
- config.py +75 -0
- configs/__init__.py +1 -0
- configs/config.yaml +43 -0
- configs/embedding_config.yaml +10 -0
- configs/retrieval.yaml +5 -0
- crawler/__init__.py +1 -0
- crawler/backfill_labels.py +72 -0
- crawler/export.py +94 -0
- crawler/fetcher.py +102 -0
- crawler/parser_catalog.py +143 -0
- crawler/parser_detail.py +320 -0
- crawler/qa_checks.py +74 -0
- crawler/robots.py +35 -0
- crawler/run.py +165 -0
- crawler/storage.py +209 -0
- crawler/utils.py +61 -0
- docker-compose.yml +26 -0
- embeddings/generator.py +68 -0
- eval/__init__.py +1 -0
- eval/compare_runs.py +34 -0
- eval/diagnostic_topk.py +88 -0
- eval/metrics.py +27 -0
- eval/run_eval.py +238 -0
- frontend/.dockerignore +9 -0
- frontend/Dockerfile +19 -0
- frontend/index.html +43 -0
- frontend/next-env.d.ts +5 -0
- frontend/next.config.mjs +8 -0
- frontend/out/404.html +1 -0
- frontend/out/_next/static/chunks/23-02b97631d99e6f05.js +0 -0
- frontend/out/_next/static/chunks/app/_not-found/page-a99a188ec9244b3f.js +1 -0
- frontend/out/_next/static/chunks/app/layout-fc95adeb217fd9c8.js +1 -0
- frontend/out/_next/static/chunks/app/page-73ea6ec0ec8fa438.js +16 -0
- frontend/out/_next/static/chunks/fd9d1056-0eb575322ff5015c.js +0 -0
- frontend/out/_next/static/chunks/framework-aec844d2ccbe7592.js +0 -0
- frontend/out/_next/static/chunks/main-app-df951a18dbec0e17.js +1 -0
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
.dockerignore
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.env
|
| 4 |
+
.env.*
|
| 5 |
+
.venv
|
| 6 |
+
venv
|
| 7 |
+
__pycache__
|
| 8 |
+
*.pyc
|
| 9 |
+
*.pyo
|
| 10 |
+
.pytest_cache
|
| 11 |
+
.mypy_cache
|
| 12 |
+
.ruff_cache
|
| 13 |
+
.model_cache
|
| 14 |
+
.cache
|
| 15 |
+
node_modules
|
| 16 |
+
frontend/node_modules
|
| 17 |
+
frontend/.next
|
| 18 |
+
frontend/.turbo
|
| 19 |
+
frontend/.vercel
|
| 20 |
+
runs
|
| 21 |
+
logs
|
| 22 |
+
*.log
|
| 23 |
+
*.tmp
|
| 24 |
+
*.swp
|
| 25 |
+
*.swo
|
| 26 |
+
*.orig
|
| 27 |
+
*.DS_Store
|
.env.example
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Application
|
| 2 |
+
APP_ENV=local
|
| 3 |
+
LOG_LEVEL=INFO
|
| 4 |
+
CONFIG_PATH=configs/config.yaml
|
| 5 |
+
USER_AGENT=llm-recommendation-engine/0.1 (+https://example.com)
|
| 6 |
+
START_URL=https://www.shl.com/products/product-catalog/
|
| 7 |
+
MAX_CONCURRENCY=2
|
| 8 |
+
REQUEST_DELAY_SECONDS=1.5
|
| 9 |
+
JITTER_SECONDS=0.5
|
| 10 |
+
MAX_RETRIES=3
|
| 11 |
+
ALLOW_ROBOTS_BYPASS=0
|
| 12 |
+
|
| 13 |
+
# External services / secrets
|
| 14 |
+
OPENAI_API_KEY=replace_me
|
| 15 |
+
VECTOR_DB_URL=replace_me
|
| 16 |
+
TRACING_ENDPOINT=replace_me
|
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.github/workflows/push_to_hf_space.yml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Push to Hugging Face Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ "main" ]
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
sync-to-hf:
|
| 9 |
+
runs-on: ubuntu-latest
|
| 10 |
+
steps:
|
| 11 |
+
- name: Checkout
|
| 12 |
+
uses: actions/checkout@v4
|
| 13 |
+
with:
|
| 14 |
+
lfs: true
|
| 15 |
+
|
| 16 |
+
- name: Push to HF Space
|
| 17 |
+
env:
|
| 18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
run: |
|
| 20 |
+
git config --global user.email "ci@github"
|
| 21 |
+
git config --global user.name "github-actions"
|
| 22 |
+
|
| 23 |
+
# Clone the Space repo
|
| 24 |
+
git clone https://AgamP:$HF_TOKEN@huggingface.co/spaces/AgamP/llm_recommendation_backend hf_space
|
| 25 |
+
|
| 26 |
+
# Replace Space contents with GitHub repo contents (except .git)
|
| 27 |
+
|
| 28 |
+
rsync -av --delete \
|
| 29 |
+
--exclude ".git" \
|
| 30 |
+
--exclude "hf_space" \
|
| 31 |
+
--exclude "*.pdf" \
|
| 32 |
+
./ hf_space/
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
cd hf_space
|
| 36 |
+
git add -A
|
| 37 |
+
git commit -m "Sync from GitHub $(date -u +'%Y-%m-%dT%H:%M:%SZ')" || echo "No changes"
|
| 38 |
+
git push
|
.gitignore
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
*.sqlite
|
| 5 |
+
*.db
|
| 6 |
+
logs/
|
| 7 |
+
data/crawler.db
|
| 8 |
+
data/catalog.parquet
|
| 9 |
+
data/catalog.jsonl
|
| 10 |
+
playwright-report/
|
| 11 |
+
playwright/.cache/
|
| 12 |
+
node_modules/
|
| 13 |
+
venv/
|
| 14 |
+
runs/
|
| 15 |
+
frontend/.next/
|
| 16 |
+
frontend/node_modules/
|
| 17 |
+
.model_cache/
|
| 18 |
+
data
|
| 19 |
+
data/
|
| 20 |
+
models/reranker_crossenc/v0.1.0/
|
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_NO_CACHE_DIR=1
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
# System deps for numpy/faiss/scipy style builds; drop if wheels suffice
|
| 10 |
+
RUN apt-get update && \
|
| 11 |
+
apt-get install -y --no-install-recommends build-essential && \
|
| 12 |
+
rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy source after deps to leverage Docker layer caching
|
| 18 |
+
COPY . .
|
| 19 |
+
|
| 20 |
+
EXPOSE 8000
|
| 21 |
+
# Render sets PORT; default to 8000 for local use
|
| 22 |
+
ENV PORT=8000
|
| 23 |
+
CMD ["sh", "-c", "uvicorn agent.server:app --host 0.0.0.0 --port ${PORT} --workers 2"]
|
Makefile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PYTHON ?= python3
|
| 2 |
+
VENV ?= .venv
|
| 3 |
+
ACTIVATE = . $(VENV)/bin/activate
|
| 4 |
+
APP_NAME ?= llm-recommender
|
| 5 |
+
|
| 6 |
+
.PHONY: setup install config-check lint test docker-build docker-run clean
|
| 7 |
+
|
| 8 |
+
setup:
|
| 9 |
+
$(PYTHON) -m venv $(VENV)
|
| 10 |
+
|
| 11 |
+
install: setup
|
| 12 |
+
$(ACTIVATE) && pip install --upgrade pip && pip install -r requirements.txt
|
| 13 |
+
|
| 14 |
+
config-check:
|
| 15 |
+
$(ACTIVATE) && PYTHONPATH=. $(PYTHON) config.py --print
|
| 16 |
+
|
| 17 |
+
lint:
|
| 18 |
+
@echo "Add linting tools (ruff/black/flake8) here"
|
| 19 |
+
|
| 20 |
+
test:
|
| 21 |
+
$(ACTIVATE) && PYTHONPATH=. pytest
|
| 22 |
+
|
| 23 |
+
docker-build:
|
| 24 |
+
docker build -t $(APP_NAME):dev .
|
| 25 |
+
|
| 26 |
+
docker-run:
|
| 27 |
+
docker run --rm -it -p 8000:8000 -p 3000:3000 --env-file .env.example $(APP_NAME):dev
|
| 28 |
+
|
| 29 |
+
clean:
|
| 30 |
+
rm -rf $(VENV) __pycache__ */__pycache__
|
README.md
CHANGED
|
@@ -1,12 +1,149 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# llm_recommendation_engine
|
| 2 |
+
Recommendation engine for SHL's product catalogue with conversational agents
|
| 3 |
+
|
| 4 |
+
## Quick commands (crawler + export + QA)
|
| 5 |
+
- Install deps (and Playwright browser): `python -m pip install -r requirements.txt && python -m playwright install chromium`
|
| 6 |
+
- Clean DB: `rm -f data/crawler.db`
|
| 7 |
+
- Crawl (bypass robots if needed): `ALLOW_ROBOTS_BYPASS=1 python -m crawler.run --mode=crawl_all --max-discover=20`
|
| 8 |
+
- Drop `--max-discover` for full crawl.
|
| 9 |
+
- Export dataset: `python -m crawler.run --mode=export --limit-export=20`
|
| 10 |
+
- Outputs: `data/catalog.parquet`, `data/catalog.jsonl`
|
| 11 |
+
- Drop `--limit-export` for full export.
|
| 12 |
+
- QA checks: `python -m crawler.qa_checks data/catalog.jsonl > data/qa_summary.json`
|
| 13 |
+
- Summary JSON saved to `data/qa_summary.json`
|
| 14 |
+
|
| 15 |
+
## What’s implemented
|
| 16 |
+
- Playwright-based crawler with catalog pagination, detail fetch, and structured storage in SQLite.
|
| 17 |
+
- Field extraction: url, name, description, test_type (+full), remote/adaptive flags, duration (minutes/hours), job_levels, languages, downloads.
|
| 18 |
+
- Export to Parquet/JSONL plus QA summary script for downstream sanity checks.
|
| 19 |
+
|
| 20 |
+
## Evaluation harness (Phase 2)
|
| 21 |
+
- Catalog loader with canonical IDs: `python -m data.catalog_loader --input data/catalog.jsonl --output data/catalog_with_ids.jsonl`
|
| 22 |
+
- Train loader + label resolution report: `python -m data.train_loader --catalog data/catalog.jsonl --train <train_file> --report data/label_resolution_report.json`
|
| 23 |
+
- Run eval (dummy baseline): `python -m eval.run_eval --catalog data/catalog.jsonl --train <train_file> --recommender dummy_random`
|
| 24 |
+
- Run eval (BM25 baseline): `python -m eval.run_eval --catalog data/catalog.jsonl --train <train_file> --recommender bm25`
|
| 25 |
+
- Outputs run folder under `runs/<timestamp>_<recommender>/` with `metrics.json`, `per_query_results.jsonl`, `worst_queries.csv`, `label_resolution_report.json`
|
| 26 |
+
- Compare runs: `python -m eval.compare_runs runs/<run_a> runs/<run_b>`
|
| 27 |
+
|
| 28 |
+
Recommender interface lives in `recommenders/base.py`; a random baseline is in `recommenders/dummy_random.py`. Metrics (Recall@k, MRR@10) are in `eval/metrics.py`.
|
| 29 |
+
|
| 30 |
+
## Label probing & backfill (improve label coverage)
|
| 31 |
+
- Probe unmatched label URLs (after a label match run): `python -m scripts.probe_unmatched_labels --labels data/label_resolution_report.json --output reports/label_url_probe.csv` — classifies label URLs (valid detail vs 404/blocked).
|
| 32 |
+
- Backfill valid label pages into DB: `python -m crawler.backfill_labels --probe-csv reports/label_url_probe.csv --allow-robots-bypass` — fetches & inserts DETAIL_PAGE_VALID URLs.
|
| 33 |
+
- Re-export and rematch after backfill:
|
| 34 |
+
- `python -m crawler.run --mode=export`
|
| 35 |
+
- `python -m data.catalog_loader --input data/catalog.jsonl --output data/catalog_with_ids.jsonl`
|
| 36 |
+
- `python -m data.train_loader --catalog data/catalog.jsonl --train <train_file> --sheet "Train-Set" --report data/label_resolution_report.json`
|
| 37 |
+
|
| 38 |
+
## Vector pipeline (semantic retrieval)
|
| 39 |
+
- Build doc_text: `python -m data.document_builder --input data/catalog.jsonl --output data/catalog_docs.jsonl`
|
| 40 |
+
- Generate embeddings: `python -m embeddings.generator --catalog data/catalog_docs.jsonl --model sentence-transformers/all-MiniLM-L6-v2 --output-dir data/embeddings`
|
| 41 |
+
- Build FAISS index: `python -m retrieval.build_index --embeddings data/embeddings/embeddings.npy --ids data/embeddings/assessment_ids.json --index-path data/faiss_index/index.faiss`
|
| 42 |
+
- Vector components:
|
| 43 |
+
- Model wrapper: `models/embedding_model.py`
|
| 44 |
+
- Index wrapper: `retrieval/vector_index.py`
|
| 45 |
+
- Index builder script: `retrieval/build_index.py`
|
| 46 |
+
- Vector recommender scaffold: `recommenders/vector_recommender.py` (wire with assessment_ids + index)
|
| 47 |
+
|
| 48 |
+
## Hybrid retrieval (BM25 + vector with RRF)
|
| 49 |
+
- Run hybrid eval: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-candidates 200 --rrf-k 60`
|
| 50 |
+
- Run hybrid + cross-encoder rerank: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf_rerank --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2 --topn-candidates 200 --rrf-k 60`
|
| 51 |
+
- Run hybrid + LGBM rerank: `python -m eval.run_eval --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --recommender hybrid_rrf_lgbm --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-candidates 200 --rrf-k 60 --lgbm-model models/reranker/v0.1.0/lgbm_model.txt --lgbm-features models/reranker/v0.1.0/feature_schema.json`
|
| 52 |
+
- Diagnostics (positives in top-N vs top-10): `python -m eval.diagnostic_topk --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn 200`
|
| 53 |
+
- Run ablation (bm25/vector/hybrid across topN): `python -m scripts.run_ablation --catalog data/catalog_docs.jsonl --train data/Gen_AI\ Dataset.xlsx --vector-index data/faiss_index/index.faiss --assessment-ids data/embeddings/assessment_ids.json --model sentence-transformers/all-MiniLM-L6-v2 --topn-list 100,200,377`
|
| 54 |
+
|
| 55 |
+
## Current findings & next steps
|
| 56 |
+
- Candidate coverage is solved by top200; ranking is the bottleneck. Use union fusion + rerank.
|
| 57 |
+
- Locked decisions:
|
| 58 |
+
- Candidate pool (train): top200
|
| 59 |
+
- Candidate pool (infer): top100–200
|
| 60 |
+
- Base retriever: hybrid (BM25 + vector), union fusion, dual-query (raw + rewritten).
|
| 61 |
+
- Next: focus on reranking and constraint handling; no more embedding/model swaps.
|
| 62 |
+
|
| 63 |
+
## Core pipeline (concise commands)
|
| 64 |
+
|
| 65 |
+
### Build rich docs, embeddings, index (BGE)
|
| 66 |
+
```bash
|
| 67 |
+
python -m data.document_builder \
|
| 68 |
+
--input data/catalog.jsonl \
|
| 69 |
+
--output data/catalog_docs_rich.jsonl \
|
| 70 |
+
--variant rich \
|
| 71 |
+
--version v2_struct
|
| 72 |
+
|
| 73 |
+
python -m embeddings.generator \
|
| 74 |
+
--catalog data/catalog_docs_rich.jsonl \
|
| 75 |
+
--model BAAI/bge-small-en-v1.5 \
|
| 76 |
+
--batch-size 32 \
|
| 77 |
+
--output-dir data/embeddings_bge
|
| 78 |
+
|
| 79 |
+
python -m retrieval.build_index \
|
| 80 |
+
--embeddings data/embeddings_bge/embeddings.npy \
|
| 81 |
+
--ids data/embeddings_bge/assessment_ids.json \
|
| 82 |
+
--index-path data/faiss_index/index_bge.faiss
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Build vocab for query rewriter (optional, recommended)
|
| 86 |
+
```bash
|
| 87 |
+
python -m scripts.build_role_vocab \
|
| 88 |
+
--catalog data/catalog_docs_rich.jsonl \
|
| 89 |
+
--out data/catalog_role_vocab.json
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
### Evaluate hybrid + cross-encoder rerank (with rewriting and union fusion)
|
| 93 |
+
```bash
|
| 94 |
+
python -m eval.run_eval \
|
| 95 |
+
--catalog data/catalog_docs_rich.jsonl \
|
| 96 |
+
--train data/Gen_AI\ Dataset.xlsx \
|
| 97 |
+
--recommender hybrid_rrf_rerank \
|
| 98 |
+
--vector-index data/faiss_index/index_bge.faiss \
|
| 99 |
+
--assessment-ids data/embeddings_bge/assessment_ids.json \
|
| 100 |
+
--model BAAI/bge-small-en-v1.5 \
|
| 101 |
+
--reranker-model models/reranker_crossenc/v0.1.0 \
|
| 102 |
+
--topn-candidates 200 --rrf-k 60 \
|
| 103 |
+
--use-rewriter --vocab data/catalog_role_vocab.json \
|
| 104 |
+
--out-dir runs/$(date +%Y%m%d_%H%M%S)_hybrid_rrf_rerank_rewrite
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### Candidate coverage (bm25 vs vector vs hybrid; grouped per query)
|
| 108 |
+
```bash
|
| 109 |
+
python -m scripts.candidate_coverage \
|
| 110 |
+
--catalog data/catalog_docs_rich.jsonl \
|
| 111 |
+
--train data/Gen_AI\ Dataset.xlsx \
|
| 112 |
+
--vector-index data/faiss_index/index_bge.faiss \
|
| 113 |
+
--assessment-ids data/embeddings_bge/assessment_ids.json \
|
| 114 |
+
--embedding-model BAAI/bge-small-en-v1.5 \
|
| 115 |
+
--topn 200 \
|
| 116 |
+
--use-rewriter --vocab data/catalog_role_vocab.json \
|
| 117 |
+
--out runs/candidate_coverage.jsonl
|
| 118 |
+
|
| 119 |
+
python -m scripts.summarize_candidate_coverage \
|
| 120 |
+
--input runs/candidate_coverage.jsonl \
|
| 121 |
+
--out runs/candidate_coverage_stats.json
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
### Rewrite impact (optional)
|
| 125 |
+
```bash
|
| 126 |
+
python -m scripts.eval_rewrite_impact \
|
| 127 |
+
--catalog data/catalog_docs_rich.jsonl \
|
| 128 |
+
--train data/Gen_AI\ Dataset.xlsx \
|
| 129 |
+
--vector-index data/faiss_index/index_bge.faiss \
|
| 130 |
+
--assessment-ids data/embeddings_bge/assessment_ids.json \
|
| 131 |
+
--embedding-model BAAI/bge-small-en-v1.5 \
|
| 132 |
+
--topn 200 \
|
| 133 |
+
--vocab data/catalog_role_vocab.json \
|
| 134 |
+
--out runs/rewrite_impact.jsonl
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
## Frontend + backend (Next.js + FastAPI)
|
| 138 |
+
|
| 139 |
+
Backend (FastAPI):
|
| 140 |
+
- Start: `uvicorn agent.server:app --reload --port 8000`
|
| 141 |
+
- Health: `GET /health`
|
| 142 |
+
- Chat: `POST /chat` (returns compact top-10 + optional summary when verbose=true)
|
| 143 |
+
- Recommend: `POST /recommend` with `{"query": "..."}` returns `{"recommended_assessments": [...]}` (top-10)
|
| 144 |
+
|
| 145 |
+
Frontend (Next.js in `frontend/`):
|
| 146 |
+
- Install deps: `cd frontend && npm install`
|
| 147 |
+
- Dev: `npm run dev` (will start on port 3000; ensure backend is running on 8000 or set API base in UI)
|
| 148 |
+
- Build/start: `npm run build && npm run start`
|
| 149 |
+
- UI is at `http://localhost:3000/` (API base defaults to `http://localhost:8000`, editable in the UI)
|
agent/app.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Lightweight agent harness (no LangChain server) to demonstrate the tool stack end-to-end.
|
| 5 |
+
This keeps ranking deterministic; LLM can be plugged later for structured QueryPlan.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from typing import Callable
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
from data.catalog_loader import load_catalog
|
| 14 |
+
from recommenders.bm25 import BM25Recommender
|
| 15 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 16 |
+
from retrieval.vector_index import VectorIndex
|
| 17 |
+
from models.embedding_model import EmbeddingModel
|
| 18 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 19 |
+
|
| 20 |
+
from tools.query_plan_tool import build_query_plan
|
| 21 |
+
from tools.retrieve_tool import retrieve_candidates
|
| 22 |
+
from tools.rerank_tool import rerank_candidates
|
| 23 |
+
from tools.constraints_tool import apply_constraints
|
| 24 |
+
from tools.explain_tool import explain
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_resources():
|
| 28 |
+
df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
|
| 29 |
+
bm25 = BM25Recommender(df_catalog)
|
| 30 |
+
embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
|
| 31 |
+
index = VectorIndex.load("data/faiss_index/index_bge.faiss")
|
| 32 |
+
with open("data/embeddings_bge/assessment_ids.json") as f:
|
| 33 |
+
ids = json.load(f)
|
| 34 |
+
vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
|
| 35 |
+
return df_catalog, bm25, vec
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
|
| 39 |
+
cat = df_catalog.set_index("assessment_id")
|
| 40 |
+
|
| 41 |
+
def lookup(aid: str) -> dict:
|
| 42 |
+
if aid in cat.index:
|
| 43 |
+
return cat.loc[aid].to_dict()
|
| 44 |
+
return {}
|
| 45 |
+
|
| 46 |
+
return lookup
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def run_query(user_text: str, vocab_path="data/catalog_role_vocab.json"):
|
| 50 |
+
vocab = json.load(open(vocab_path)) if vocab_path else {}
|
| 51 |
+
df_catalog, bm25, vec = load_resources()
|
| 52 |
+
catalog_lookup = make_catalog_lookup(df_catalog)
|
| 53 |
+
|
| 54 |
+
# Step 1: plan (deterministic rewriter for now; swap with LLM structured plan if desired)
|
| 55 |
+
plan = build_query_plan(user_text, vocab=vocab)
|
| 56 |
+
|
| 57 |
+
# Step 2: retrieve (union)
|
| 58 |
+
cand_set = retrieve_candidates(plan, bm25, vec, topn=200, catalog_df=df_catalog)
|
| 59 |
+
|
| 60 |
+
# Step 3: rerank (use best reranker)
|
| 61 |
+
reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
|
| 62 |
+
ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
|
| 63 |
+
|
| 64 |
+
# Step 4: constraints (hook; currently passthrough)
|
| 65 |
+
final_list = apply_constraints(plan, ranked)
|
| 66 |
+
|
| 67 |
+
# Step 5: explanation
|
| 68 |
+
summary = explain(plan, final_list, catalog_lookup)
|
| 69 |
+
return summary
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
import sys
|
| 74 |
+
|
| 75 |
+
user_text = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
|
| 76 |
+
print(run_query(user_text))
|
agent/chat_agent.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Chat-style agent using Gemini for planning + explanation, deterministic tools for retrieval/rerank.
|
| 5 |
+
Set GOOGLE_API_KEY in your environment.
|
| 6 |
+
"""
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
from typing import Callable
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
from data.catalog_loader import load_catalog
|
| 14 |
+
from recommenders.bm25 import BM25Recommender
|
| 15 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 16 |
+
from retrieval.vector_index import VectorIndex
|
| 17 |
+
from models.embedding_model import EmbeddingModel
|
| 18 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 19 |
+
|
| 20 |
+
from tools.query_plan_tool_llm import build_query_plan_llm
|
| 21 |
+
from tools.query_plan_tool import build_query_plan as deterministic_plan
|
| 22 |
+
from tools.retrieve_tool import retrieve_candidates
|
| 23 |
+
from tools.rerank_tool import rerank_candidates
|
| 24 |
+
from tools.constraints_tool import apply_constraints
|
| 25 |
+
from tools.explain_tool import explain
|
| 26 |
+
from schemas.query_plan import QueryPlan
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def load_resources():
|
| 30 |
+
df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
|
| 31 |
+
bm25 = BM25Recommender(df_catalog)
|
| 32 |
+
embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
|
| 33 |
+
index = VectorIndex.load("data/faiss_index/index_bge.faiss")
|
| 34 |
+
with open("data/embeddings_bge/assessment_ids.json") as f:
|
| 35 |
+
ids = json.load(f)
|
| 36 |
+
vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
|
| 37 |
+
catalog_by_id = {row["assessment_id"]: row for _, row in df_catalog.iterrows()}
|
| 38 |
+
return df_catalog, bm25, vec, catalog_by_id
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
|
| 42 |
+
cat = df_catalog.set_index("assessment_id")
|
| 43 |
+
|
| 44 |
+
def lookup(aid: str) -> dict:
|
| 45 |
+
if aid in cat.index:
|
| 46 |
+
return cat.loc[aid].to_dict()
|
| 47 |
+
return {}
|
| 48 |
+
|
| 49 |
+
return lookup
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _maybe_clarify(plan: QueryPlan, cand_count: int, topn: int) -> str | None:
|
| 53 |
+
# LLM-flagged clarification
|
| 54 |
+
if plan.needs_clarification and plan.clarifying_question:
|
| 55 |
+
return plan.clarifying_question
|
| 56 |
+
# Coverage-based triggers
|
| 57 |
+
if cand_count < max(10, int(0.25 * topn)):
|
| 58 |
+
return "Results look thin. Clarify: are you looking for (1) personality/culture fit, (2) leadership judgment (SJT), or (3) role capability?"
|
| 59 |
+
if plan.intent in {"BEHAVIORAL", "UNKNOWN", "MIXED"} and cand_count < max(20, int(0.5 * topn)):
|
| 60 |
+
return "For culture/behavioral focus, choose: (1) personality/culture fit, (2) leadership judgment (SJT), or (3) role capability. Please pick one."
|
| 61 |
+
return None
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def run_chat(
|
| 65 |
+
user_text: str,
|
| 66 |
+
vocab_path: str = "data/catalog_role_vocab.json",
|
| 67 |
+
model_name: str = "gemini-2.5-flash-lite",
|
| 68 |
+
clarification_answer: str | None = None,
|
| 69 |
+
topn: int = 200,
|
| 70 |
+
verbose: bool = False,
|
| 71 |
+
):
|
| 72 |
+
vocab = json.load(open(vocab_path)) if vocab_path and os.path.exists(vocab_path) else {}
|
| 73 |
+
df_catalog, bm25, vec, catalog_by_id = load_resources()
|
| 74 |
+
catalog_lookup = make_catalog_lookup(df_catalog)
|
| 75 |
+
|
| 76 |
+
trace_id = f"trace-{abs(hash(user_text))}"
|
| 77 |
+
log = {"trace_id": trace_id, "raw_query": user_text}
|
| 78 |
+
|
| 79 |
+
# Plan with LLM; fallback deterministic if LLM fails
|
| 80 |
+
try:
|
| 81 |
+
plan = build_query_plan_llm(user_text, vocab=vocab, model_name=model_name)
|
| 82 |
+
QueryPlan.model_validate(plan.dict()) # schema guard
|
| 83 |
+
log["plan_source"] = "llm"
|
| 84 |
+
except Exception as e:
|
| 85 |
+
plan = deterministic_plan(user_text, vocab=vocab)
|
| 86 |
+
log["plan_source"] = f"deterministic (llm_fail={str(e)})"
|
| 87 |
+
log["query_plan"] = plan.dict()
|
| 88 |
+
|
| 89 |
+
# Retrieve union
|
| 90 |
+
cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
|
| 91 |
+
if verbose:
|
| 92 |
+
log["candidates"] = [c.model_dump() for c in cand_set.candidates[:10]]
|
| 93 |
+
|
| 94 |
+
# Clarification loop
|
| 95 |
+
question = _maybe_clarify(plan, cand_count=len(cand_set.candidates), topn=topn)
|
| 96 |
+
if question and not clarification_answer:
|
| 97 |
+
log["clarification"] = question
|
| 98 |
+
if verbose:
|
| 99 |
+
print(json.dumps(log, indent=2))
|
| 100 |
+
return f"Clarification needed: {question}"
|
| 101 |
+
if question and clarification_answer:
|
| 102 |
+
clarified_text = f"{user_text}\nUser clarification: {clarification_answer}"
|
| 103 |
+
try:
|
| 104 |
+
plan = build_query_plan_llm(clarified_text, vocab=vocab, model_name=model_name)
|
| 105 |
+
QueryPlan.model_validate(plan.dict())
|
| 106 |
+
except Exception:
|
| 107 |
+
plan = deterministic_plan(clarified_text, vocab=vocab)
|
| 108 |
+
log["query_plan_clarified"] = plan.dict()
|
| 109 |
+
cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
|
| 110 |
+
if verbose:
|
| 111 |
+
log["candidates_clarified"] = [c.model_dump() for c in cand_set.candidates[:10]]
|
| 112 |
+
|
| 113 |
+
# Rerank
|
| 114 |
+
reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
|
| 115 |
+
ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
|
| 116 |
+
log["rerank"] = [item.model_dump() for item in ranked.items]
|
| 117 |
+
|
| 118 |
+
# Constraints
|
| 119 |
+
final_list = apply_constraints(plan, ranked, catalog_by_id, k=10)
|
| 120 |
+
log["final"] = [item.model_dump() for item in final_list.items]
|
| 121 |
+
|
| 122 |
+
# Explain
|
| 123 |
+
summary = explain(plan, final_list, catalog_lookup)
|
| 124 |
+
log["summary"] = summary
|
| 125 |
+
|
| 126 |
+
# Compact output: top-10 with metadata
|
| 127 |
+
final_results = []
|
| 128 |
+
for item in final_list.items:
|
| 129 |
+
meta = catalog_lookup(item.assessment_id)
|
| 130 |
+
final_results.append(
|
| 131 |
+
{
|
| 132 |
+
"assessment_id": item.assessment_id,
|
| 133 |
+
"score": item.score,
|
| 134 |
+
"name": meta.get("name"),
|
| 135 |
+
"url": meta.get("url"),
|
| 136 |
+
"test_type_full": meta.get("test_type_full") or meta.get("test_type"),
|
| 137 |
+
"duration": meta.get("duration_minutes") or meta.get("duration"),
|
| 138 |
+
}
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
if verbose:
|
| 142 |
+
log["final_results"] = final_results
|
| 143 |
+
print(json.dumps(log, indent=2))
|
| 144 |
+
else:
|
| 145 |
+
print(json.dumps({"trace_id": trace_id, "final_results": final_results}, indent=2))
|
| 146 |
+
|
| 147 |
+
return summary
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
if __name__ == "__main__":
    import sys

    # Warn (but do not abort) when the Gemini key is missing: the pipeline
    # falls back to deterministic_plan() when the LLM planner raises.
    if "GOOGLE_API_KEY" not in os.environ:
        print("Please set GOOGLE_API_KEY for Gemini.")
    # Join CLI args into one query; use a sample query when none are given.
    user_text = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
    print(run_chat(user_text, verbose=False))
|
agent/router_agent.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Router-style agent (minimal, deterministic) that orchestrates the tool stack:
|
| 5 |
+
- build_query_plan
|
| 6 |
+
- retrieve_candidates
|
| 7 |
+
- rerank_candidates
|
| 8 |
+
- apply_constraints
|
| 9 |
+
- explain
|
| 10 |
+
|
| 11 |
+
This is intentionally simple and does not require an LLM. You can swap
|
| 12 |
+
build_query_plan with an LLM-based planner that emits the same QueryPlan schema.
|
| 13 |
+
"""
|
| 14 |
+
import json
|
| 15 |
+
from typing import Callable
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
from data.catalog_loader import load_catalog
|
| 20 |
+
from recommenders.bm25 import BM25Recommender
|
| 21 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 22 |
+
from retrieval.vector_index import VectorIndex
|
| 23 |
+
from models.embedding_model import EmbeddingModel
|
| 24 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 25 |
+
|
| 26 |
+
from tools.query_plan_tool import build_query_plan
|
| 27 |
+
from tools.retrieve_tool import retrieve_candidates
|
| 28 |
+
from tools.rerank_tool import rerank_candidates
|
| 29 |
+
from tools.constraints_tool import apply_constraints
|
| 30 |
+
from tools.explain_tool import explain
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_resources():
    """Load the catalog DataFrame plus lexical and dense recommenders.

    Returns:
        (df_catalog, bm25, vec): catalog DataFrame, BM25 recommender, and a
        vector recommender backed by the BGE embeddings + FAISS index on disk.
    """
    catalog_df, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
    with open("data/embeddings_bge/assessment_ids.json") as fh:
        assessment_ids = json.load(fh)
    lexical = BM25Recommender(catalog_df)
    dense = VectorRecommender(
        EmbeddingModel("BAAI/bge-small-en-v1.5"),
        VectorIndex.load("data/faiss_index/index_bge.faiss"),
        catalog_df,
        assessment_ids,
        k_candidates=200,
    )
    return catalog_df, lexical, dense
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def make_catalog_lookup(df_catalog: pd.DataFrame) -> Callable[[str], dict]:
    """Return a closure that maps an assessment_id to its catalog row as a dict.

    Unknown ids yield an empty dict instead of raising.
    """
    indexed = df_catalog.set_index("assessment_id")

    def lookup(aid: str) -> dict:
        if aid not in indexed.index:
            return {}
        return indexed.loc[aid].to_dict()

    return lookup
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def route_query(user_text: str, vocab_path: str = "data/catalog_role_vocab.json") -> str:
    """Run the full deterministic tool pipeline for one query.

    Args:
        user_text: Raw natural-language query from the user.
        vocab_path: Optional JSON role-vocabulary file; a missing or
            unreadable file degrades gracefully to an empty vocab.

    Returns:
        The human-readable explanation produced by explain().
    """
    # Bug fix: json.load(open(path)) leaked the file handle and raised
    # FileNotFoundError when the default vocab file was absent.
    vocab: dict = {}
    if vocab_path:
        try:
            with open(vocab_path) as vf:
                vocab = json.load(vf)
        except (OSError, json.JSONDecodeError):
            vocab = {}
    df_catalog, bm25, vec = load_resources()
    catalog_lookup = make_catalog_lookup(df_catalog)

    # 1) Plan (deterministic rewriter; swap with LLM-structured plan if desired)
    plan = build_query_plan(user_text, vocab=vocab)

    # 2) Clarification hook
    # Placeholder: in an interactive app, if plan.needs_clarification or coverage is weak,
    # ask a question and rebuild the plan with the user response.

    # 3) Retrieve (union of BM25 + vector)
    cand_set = retrieve_candidates(plan, bm25, vec, topn=200, catalog_df=df_catalog)

    # 4) Rerank (cross-encoder)
    reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
    ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)

    # 5) Apply constraints (stub; extend for duration/remote/adaptive)
    final_list = apply_constraints(plan, ranked)

    # 6) Explain
    summary = explain(plan, final_list, catalog_lookup)
    return summary
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if __name__ == "__main__":
    import sys

    # Join CLI args into a single query; fall back to a sample query.
    query = " ".join(sys.argv[1:]) or "Find a 1 hour culture fit assessment for a COO"
    print(route_query(query))
|
agent/server.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Minimal chat backend (FastAPI) that delegates to the agent app pipeline.
|
| 5 |
+
|
| 6 |
+
Run:
|
| 7 |
+
uvicorn agent.server:app --reload --port 8000
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import uuid
|
| 11 |
+
import json
|
| 12 |
+
from typing import Optional, Callable
|
| 13 |
+
from collections import deque
|
| 14 |
+
import time
|
| 15 |
+
import math
|
| 16 |
+
|
| 17 |
+
from fastapi import FastAPI
|
| 18 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
+
from fastapi.responses import FileResponse
|
| 20 |
+
from fastapi.staticfiles import StaticFiles
|
| 21 |
+
from pydantic import BaseModel
|
| 22 |
+
|
| 23 |
+
from functools import lru_cache
|
| 24 |
+
import os
|
| 25 |
+
from data.catalog_loader import load_catalog
|
| 26 |
+
from recommenders.bm25 import BM25Recommender
|
| 27 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 28 |
+
from retrieval.vector_index import VectorIndex
|
| 29 |
+
from models.embedding_model import EmbeddingModel
|
| 30 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 31 |
+
from tools.query_plan_tool import build_query_plan
|
| 32 |
+
from tools.query_plan_tool_llm import build_query_plan_llm
|
| 33 |
+
from llm.nu_extract import NuExtractWrapper, default_query_rewrite_examples
|
| 34 |
+
from llm.qwen_rewriter import QwenRewriter
|
| 35 |
+
from tools.retrieve_tool import retrieve_candidates
|
| 36 |
+
from tools.rerank_tool import rerank_candidates
|
| 37 |
+
from tools.constraints_tool import apply_constraints
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ChatRequest(BaseModel):
    """Request body for POST /chat."""

    query: str
    # Optional free-text answer to a previous clarification question.
    clarification_answer: Optional[str] = None
    # When True, the response also includes the summary and debug payload.
    verbose: bool = False
|
| 44 |
+
|
| 45 |
+
class RecommendRequest(BaseModel):
    """Request body for POST /recommend."""

    query: str
    # Optional override for the LLM rewriter model name (passed to load_resources).
    llm_model: Optional[str] = None
    # When True, the response also includes the summary and debug payload.
    verbose: bool = False
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _make_catalog_lookup(df_catalog) -> Callable[[str], dict]:
|
| 52 |
+
cat = df_catalog.set_index("assessment_id")
|
| 53 |
+
def lookup(aid: str) -> dict:
|
| 54 |
+
if aid in cat.index:
|
| 55 |
+
return cat.loc[aid].to_dict()
|
| 56 |
+
return {}
|
| 57 |
+
return lookup
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@lru_cache(maxsize=1)
def load_resources(llm_model_override: Optional[str] = None):
    """Load and cache all pipeline resources (catalog, retrievers, reranker, LLM).

    NOTE(review): maxsize=1 means calls with different ``llm_model_override``
    values evict each other's cached entry, so an override from /recommend
    only reliably takes effect after a cache clear (done in verbose mode).

    Returns:
        (df_catalog, bm25, vec, reranker, lookup, vocab, llm_extractor, catalog_by_id)
    """
    df_catalog, _, _ = load_catalog("data/catalog_docs_rich.jsonl")
    bm25 = BM25Recommender(df_catalog)
    embed = EmbeddingModel("BAAI/bge-small-en-v1.5")
    index = VectorIndex.load("data/faiss_index/index_bge.faiss")
    with open("data/embeddings_bge/assessment_ids.json") as f:
        ids = json.load(f)
    vec = VectorRecommender(embed, index, df_catalog, ids, k_candidates=200)
    reranker = CrossEncoderReranker(model_name="models/reranker_crossenc/v0.1.0")
    lookup = _make_catalog_lookup(df_catalog)
    # Plain-dict view of the catalog keyed by assessment_id (used by constraints).
    catalog_by_id = {row["assessment_id"]: row for _, row in df_catalog.iterrows()}
    vocab = {}
    vocab_path = "data/catalog_role_vocab.json"
    if os.path.exists(vocab_path):
        try:
            with open(vocab_path) as vf:
                vocab = json.load(vf)
        except Exception:
            # Best-effort: a malformed vocab file degrades to an empty vocab.
            vocab = {}
    # Optional LLM rewriter; choose via request override or env LLM_MODEL
    llm_extractor = None
    llm_model = llm_model_override or os.getenv("LLM_MODEL", "").strip()
    if not llm_model:
        llm_model = "Qwen/Qwen2.5-1.5B-Instruct"
    try:
        # Qwen models get the Qwen rewriter; otherwise NuExtract is used only
        # when no Gemini key is configured.
        if llm_model.lower().startswith("qwen"):
            llm_extractor = QwenRewriter(model_name=llm_model, default_examples=default_query_rewrite_examples())
        elif not os.getenv("GOOGLE_API_KEY"):
            llm_extractor = NuExtractWrapper(default_examples=default_query_rewrite_examples())
    except Exception:
        # Rewriter construction is optional; planning falls back deterministically.
        llm_extractor = None
    return df_catalog, bm25, vec, reranker, lookup, vocab, llm_extractor, catalog_by_id
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _infer_remote_adaptive(meta: dict) -> (Optional[bool], Optional[bool]):
|
| 96 |
+
remote = meta.get("remote_support", True if meta.get("remote_support") is None else meta.get("remote_support"))
|
| 97 |
+
adaptive = meta.get("adaptive_support")
|
| 98 |
+
text_blob = " ".join([str(meta.get("name", "")), str(meta.get("description", "")), str(meta.get("doc_text", ""))]).lower()
|
| 99 |
+
if adaptive is None and "adaptive" in text_blob:
|
| 100 |
+
adaptive = True
|
| 101 |
+
return remote, adaptive
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _build_plan_with_fallback(query: str, vocab: dict, llm_extractor):
    """Build a query plan, preferring the LLM rewriter when one is available.

    Any rewriter failure degrades to the deterministic rewrite; Gemini
    refinement is deliberately skipped to keep behavior predictable.
    """
    try:
        plan = build_query_plan(query, vocab=vocab, llm_extractor=llm_extractor)
    except Exception:
        plan = build_query_plan(query, vocab=vocab)
    return plan
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _safe_num(val):
|
| 116 |
+
try:
|
| 117 |
+
if val is None:
|
| 118 |
+
return None
|
| 119 |
+
f = float(val)
|
| 120 |
+
if math.isfinite(f):
|
| 121 |
+
return f
|
| 122 |
+
except Exception:
|
| 123 |
+
return None
|
| 124 |
+
return None
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _sanitize_debug(obj):
    """Recursively replace NaN/inf with None to keep the payload JSON-safe.

    Containers are rebuilt; numbers pass through _safe_num; everything else
    is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: _sanitize_debug(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_sanitize_debug(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(_sanitize_debug(v) for v in obj)
    # Bug fix: bool is a subclass of int, so True/False were previously
    # coerced to 1.0/0.0 by the numeric branch below. Pass bools through.
    if isinstance(obj, bool):
        return obj
    if isinstance(obj, (int, float)):
        return _safe_num(obj)
    return obj
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# SHL single-letter test-type codes mapped to their full display names,
# used by _format_test_types to expand compact catalog codes.
CODE_TO_FULL = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _format_test_types(meta: dict) -> list[str]:
|
| 153 |
+
if meta.get("test_type_full"):
|
| 154 |
+
raw = meta["test_type_full"]
|
| 155 |
+
elif meta.get("test_type"):
|
| 156 |
+
raw = meta["test_type"]
|
| 157 |
+
else:
|
| 158 |
+
return []
|
| 159 |
+
if isinstance(raw, list):
|
| 160 |
+
vals = raw
|
| 161 |
+
else:
|
| 162 |
+
vals = str(raw).replace("/", ",").split(",")
|
| 163 |
+
out = []
|
| 164 |
+
for v in vals:
|
| 165 |
+
v = v.strip()
|
| 166 |
+
if not v:
|
| 167 |
+
continue
|
| 168 |
+
# Map letter codes to full names when applicable
|
| 169 |
+
if len(v) == 1 and v in CODE_TO_FULL:
|
| 170 |
+
out.append(CODE_TO_FULL[v])
|
| 171 |
+
else:
|
| 172 |
+
out.append(v)
|
| 173 |
+
return out
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def _run_pipeline(query: str, topn: int = 200, verbose: bool = False, llm_model: Optional[str] = None):
    """Run plan -> retrieve -> rerank -> constraints and shape the API output.

    Returns:
        (final_results, summary, debug_payload): list of result dicts for the
        response, a tiny summary dict, and a debug dict (populated only when
        verbose is True).
    """
    if verbose:
        # For debugging, bypass cached resources to ensure fresh state
        load_resources.cache_clear()
    df_catalog, bm25, vec, reranker, lookup, vocab, llm_extractor, catalog_by_id = load_resources(llm_model_override=llm_model)
    plan = _build_plan_with_fallback(query, vocab=vocab, llm_extractor=llm_extractor)
    cand_set = retrieve_candidates(plan, bm25, vec, topn=topn, catalog_df=df_catalog)
    ranked = rerank_candidates(plan, cand_set, reranker, df_catalog, k=10)
    final_list = apply_constraints(plan, ranked, catalog_by_id, k=10)

    debug_payload = {}
    if verbose:
        debug_payload["plan"] = plan.dict()
        # If plan carries a source (from planner), include it
        if hasattr(plan, "plan_source"):
            debug_payload["plan_source"] = getattr(plan, "plan_source")
        # Capture NuExtract LLM debug if present
        if hasattr(plan, "llm_debug") and plan.llm_debug:
            debug_payload["llm_debug"] = plan.llm_debug
        if hasattr(cand_set, "fusion") and cand_set.fusion:
            debug_payload["fusion"] = cand_set.fusion
        # Top-20 retrieval candidates with NaN/inf-safe scores.
        debug_payload["candidates"] = [
            {
                "assessment_id": c.assessment_id,
                "bm25_rank": c.bm25_rank,
                "vector_rank": c.vector_rank,
                "hybrid_rank": c.hybrid_rank,
                "bm25_score": _safe_num(c.bm25_score),
                "vector_score": _safe_num(c.vector_score),
                "score": _safe_num(c.score),
            }
            for c in cand_set.candidates[: min(20, len(cand_set.candidates))]
        ]
        debug_payload["rerank"] = [
            {"assessment_id": r.assessment_id, "score": _safe_num(r.score)}
            for r in ranked.items[: min(20, len(ranked.items))]
        ]
        debug_payload["constraints"] = [
            {
                "assessment_id": r.assessment_id,
                "score": _safe_num(r.score),
                "debug": r.debug,
            }
            for r in final_list.items
        ]

    final_results = []
    for item in final_list.items:
        meta = lookup(item.assessment_id)
        remote, adaptive = _infer_remote_adaptive(meta)
        # NOTE(review): score is computed but never used in the result dict.
        score = _safe_num(item.score)
        duration = _safe_num(meta.get("duration_minutes") or meta.get("duration"))
        duration_int = int(duration) if duration is not None else None
        description = meta.get("description") or meta.get("doc_text") or ""
        test_types = _format_test_types(meta)
        final_results.append(
            {
                "url": meta.get("url"),
                "name": meta.get("name"),
                "adaptive_support": "Yes" if adaptive else "No",
                "description": description,
                # Missing durations are reported as 0 in the API contract.
                "duration": duration_int if duration_int is not None else 0,
                "remote_support": "Yes" if remote else "No",
                "test_type": test_types,
            }
        )
    # Guarantee at least one result if pipeline produced candidates
    if not final_results and ranked.items:
        item = ranked.items[0]
        meta = lookup(item.assessment_id)
        remote, adaptive = _infer_remote_adaptive(meta)
        duration = _safe_num(meta.get("duration_minutes") or meta.get("duration"))
        duration_int = int(duration) if duration is not None else 0
        final_results.append(
            {
                "url": meta.get("url"),
                "name": meta.get("name"),
                "adaptive_support": "Yes" if adaptive else "No",
                "description": meta.get("description") or meta.get("doc_text") or "",
                "duration": duration_int,
                "remote_support": "Yes" if remote else "No",
                "test_type": _format_test_types(meta),
            }
        )
    summary = {"plan": plan.intent, "top": len(final_results)}
    return final_results, summary, debug_payload
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# FastAPI application with permissive CORS (any origin, no credentials).
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,  # '*' cannot be used with credentials
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve frontend assets
app.mount("/static", StaticFiles(directory="frontend"), name="static")

# Simple in-process rate limiter (max 5 requests per second)
# NOTE(review): shared across all requests and not thread/process-safe.
_timestamps = deque()
_RATE_LIMIT = 5
_WINDOW = 1.0
|
| 280 |
+
|
| 281 |
+
def _allow_request() -> bool:
    """Sliding-window limiter: allow at most _RATE_LIMIT calls per _WINDOW seconds."""
    now = time.time()
    # Evict timestamps that have aged out of the window.
    while _timestamps:
        if now - _timestamps[0] <= _WINDOW:
            break
        _timestamps.popleft()
    if len(_timestamps) >= _RATE_LIMIT:
        return False
    _timestamps.append(now)
    return True
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
@app.post("/chat")
def chat(req: ChatRequest):
    """Run the recommendation pipeline and return final results with a trace id.

    When req.verbose is True the response also carries the summary and a
    NaN/inf-sanitized debug payload.
    """
    if not _allow_request():
        # NOTE(review): returns HTTP 200 with an error body; consider 429.
        return {"error": "rate limit exceeded"}
    trace_id = str(uuid.uuid4())
    final_results, summary, debug_payload = _run_pipeline(req.query, verbose=req.verbose)
    payload = {"trace_id": trace_id, "final_results": final_results}
    if req.verbose:
        payload["summary"] = summary
        payload["debug"] = _sanitize_debug(debug_payload)
    return payload
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
@app.post("/recommend")
def recommend(req: RecommendRequest):
    """Run the pipeline with an optional LLM model override.

    NOTE(review): the llm_model override only takes effect when the resource
    cache is cleared (which _run_pipeline does in verbose mode), because
    load_resources is lru_cache'd with maxsize=1.
    """
    if not _allow_request():
        # NOTE(review): returns HTTP 200 with an error body; consider 429.
        return {"error": "rate limit exceeded"}
    final_results, summary, debug_payload = _run_pipeline(req.query, verbose=req.verbose, llm_model=req.llm_model)
    resp = {"recommended_assessments": final_results}
    if req.verbose:
        resp["debug"] = _sanitize_debug(debug_payload)
        resp["summary"] = summary
    return resp
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
@app.get("/health")
def health():
    """Liveness probe used by deployment health checks."""
    return {"status": "ok"}
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
@app.get("/")
def index():
    # Serve the SPA entry point
    return FileResponse("frontend/index.html")
|
api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API surface (REST/gRPC/WebSocket) for serving recommendations."""
|
config.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Centralized config loader using YAML with ENV overrides.
|
| 2 |
+
|
| 3 |
+
Environment variables prefixed with ``LRE_`` can override nested keys using
|
| 4 |
+
double-underscores, e.g. ``LRE_APP__LOG_LEVEL=DEBUG``.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import os
|
| 11 |
+
import pathlib
|
| 12 |
+
from typing import Any, Dict
|
| 13 |
+
|
| 14 |
+
import yaml
|
| 15 |
+
|
| 16 |
+
DEFAULT_CONFIG_PATH = pathlib.Path(os.environ.get("CONFIG_PATH", "configs/config.yaml"))
|
| 17 |
+
ENV_PREFIX = "LRE_"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _parse_env_value(value: str) -> Any:
|
| 21 |
+
"""Best-effort casting for env values."""
|
| 22 |
+
lowered = value.lower()
|
| 23 |
+
if lowered in {"true", "false"}:
|
| 24 |
+
return lowered == "true"
|
| 25 |
+
try:
|
| 26 |
+
return int(value)
|
| 27 |
+
except ValueError:
|
| 28 |
+
pass
|
| 29 |
+
try:
|
| 30 |
+
return float(value)
|
| 31 |
+
except ValueError:
|
| 32 |
+
pass
|
| 33 |
+
return value
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _set_nested(config: Dict[str, Any], path: list[str], value: Any) -> None:
|
| 37 |
+
cursor = config
|
| 38 |
+
for part in path[:-1]:
|
| 39 |
+
cursor = cursor.setdefault(part, {})
|
| 40 |
+
cursor[path[-1]] = value
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def apply_env_overrides(config: Dict[str, Any], prefix: str = ENV_PREFIX) -> Dict[str, Any]:
    """Apply ENV overrides in-place and return the same config dict.

    A variable like ``LRE_APP__LOG_LEVEL`` maps to config["app"]["log_level"],
    with the value cast via _parse_env_value.
    """
    for name, raw_value in os.environ.items():
        if not name.startswith(prefix):
            continue
        key_path = name[len(prefix):].lower().split("__")
        _set_nested(config, key_path, _parse_env_value(raw_value))
    return config
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_config(config_path: pathlib.Path | str | None = None) -> Dict[str, Any]:
    """Load YAML config (default: DEFAULT_CONFIG_PATH) and apply ENV overrides."""
    resolved = pathlib.Path(config_path or DEFAULT_CONFIG_PATH)
    with resolved.open() as f:
        loaded: Dict[str, Any] = yaml.safe_load(f) or {}
    return apply_env_overrides(loaded)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _cli() -> None:
    """Small CLI: resolve the config and optionally print it as YAML."""
    parser = argparse.ArgumentParser(description="Config loader helper")
    parser.add_argument(
        "--print", dest="print_config", action="store_true", help="Print resolved config"
    )
    parser.add_argument("--path", dest="config_path", type=str, help="Optional config path")
    args = parser.parse_args()

    # args.config_path is None when --path is omitted; load_config then
    # falls back to DEFAULT_CONFIG_PATH.
    cfg = load_config(args.config_path)
    if args.print_config:
        print(yaml.dump(cfg, sort_keys=False))


if __name__ == "__main__":
    _cli()
|
configs/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Config package placeholder."""
|
configs/config.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
app:
|
| 2 |
+
name: llm-recommendation-engine
|
| 3 |
+
environment: local
|
| 4 |
+
log_level: INFO
|
| 5 |
+
|
| 6 |
+
data:
|
| 7 |
+
raw_dir: data/raw
|
| 8 |
+
processed_dir: data/processed
|
| 9 |
+
cache_dir: data/cache
|
| 10 |
+
|
| 11 |
+
index:
|
| 12 |
+
type: faiss
|
| 13 |
+
dim: 384
|
| 14 |
+
store_path: data/index/faiss.index
|
| 15 |
+
|
| 16 |
+
models:
|
| 17 |
+
embedder: sentence-transformers/all-MiniLM-L6-v2
|
| 18 |
+
reranker: cross-encoder/ms-marco-MiniLM-L-6-v2
|
| 19 |
+
|
| 20 |
+
services:
|
| 21 |
+
api:
|
| 22 |
+
host: 0.0.0.0
|
| 23 |
+
port: 8000
|
| 24 |
+
ui:
|
| 25 |
+
host: 0.0.0.0
|
| 26 |
+
port: 3000
|
| 27 |
+
|
| 28 |
+
observability:
|
| 29 |
+
tracing_enabled: false
|
| 30 |
+
metrics_endpoint: /metrics
|
| 31 |
+
|
| 32 |
+
storage:
|
| 33 |
+
bucket: s3://placeholder-bucket
|
| 34 |
+
prefix: recommendations
|
| 35 |
+
|
| 36 |
+
crawler:
|
| 37 |
+
start_url: https://www.shl.com/products/product-catalog/
|
| 38 |
+
user_agent: llm-recommendation-engine/0.1 (+https://example.com)
|
| 39 |
+
max_concurrency: 2
|
| 40 |
+
request_delay_seconds: 1.5
|
| 41 |
+
jitter_seconds: 0.5
|
| 42 |
+
max_retries: 3
|
| 43 |
+
sqlite_path: data/crawler.db
|
configs/embedding_config.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
name: sentence-transformers/all-MiniLM-L6-v2
|
| 3 |
+
cache_dir: .model_cache
|
| 4 |
+
device: cpu
|
| 5 |
+
normalize_embeddings: true
|
| 6 |
+
batch_size: 32
|
| 7 |
+
|
| 8 |
+
preprocessing:
|
| 9 |
+
max_length: 512
|
| 10 |
+
padding: false
|
configs/retrieval.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
retrieval:
|
| 2 |
+
method: hybrid_rrf
|
| 3 |
+
train_topn_candidates: 200
|
| 4 |
+
infer_topn_candidates: 100
|
| 5 |
+
rrf_k: 60
|
crawler/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Data ingestion and crawling utilities."""
|
crawler/backfill_labels.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import asyncio
|
| 5 |
+
import csv
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import structlog
|
| 10 |
+
|
| 11 |
+
from config import load_config
|
| 12 |
+
from crawler.fetcher import PlaywrightFetcher
|
| 13 |
+
from crawler.parser_detail import parse_detail_page
|
| 14 |
+
from crawler.robots import RobotsManager
|
| 15 |
+
from crawler.storage import PAGE_TYPE_DETAIL, PARSE_PARSED, PageRecord, Storage
|
| 16 |
+
from crawler.utils import RateLimiter
|
| 17 |
+
|
| 18 |
+
logger = structlog.get_logger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
async def backfill_from_probe(probe_csv: str, storage: Storage, fetcher: PlaywrightFetcher, robots: RobotsManager, allow_bypass: bool):
    """Fetch and parse detail pages for rows classified DETAIL_PAGE_VALID.

    Each URL is checked against robots.txt (unless allow_bypass), fetched,
    persisted, parsed, and marked PARSE_PARSED on success.
    """
    with open(probe_csv) as f:
        reader = csv.DictReader(f)
        rows = [row for row in reader if row.get("classification") == "DETAIL_PAGE_VALID"]
    logger.info("backfill.labels.start", count=len(rows))
    for row in rows:
        url = row["url"]
        # Short-circuits: when bypassing, robots.txt is not consulted at all.
        allowed = allow_bypass or robots.is_allowed(url)
        if not allowed:
            logger.warning("backfill.detail.disallowed", url=url)
            continue
        # NOTE(review): this logs a "bypassed" warning for every URL when
        # allow_bypass is set, even URLs robots.txt would have permitted.
        if allow_bypass:
            logger.warning("backfill.detail.disallowed.bypassed", url=url)
        result = await fetcher.fetch(url, page_type=PAGE_TYPE_DETAIL)
        storage.upsert_page(result.record)
        if result.error or not result.html:
            logger.error("backfill.detail.fetch_failed", url=url, error=result.error)
            continue
        parse_detail_page(result.html, url=url, storage=storage)
        storage.update_parse_status(url, PARSE_PARSED)
|
| 42 |
+
|
| 43 |
+
def main():
    """CLI entry point: wire config, rate limiting, storage, and robots,
    then run the async backfill over the probe CSV."""
    parser = argparse.ArgumentParser(description="Backfill assessments from probed label URLs")
    parser.add_argument("--probe-csv", required=True, help="CSV from scripts/probe_unmatched_labels.py")
    parser.add_argument("--config", type=str, default=os.environ.get("CONFIG_PATH", "configs/config.yaml"))
    parser.add_argument("--sqlite", type=str, default="data/crawler.db")
    parser.add_argument("--allow-robots-bypass", action="store_true", help="Bypass robots.txt disallow (use responsibly)")
    args = parser.parse_args()

    config = load_config(args.config)
    # Env vars override config values, which override hard-coded defaults.
    rate_limiter = RateLimiter(
        base_delay=float(os.environ.get("REQUEST_DELAY_SECONDS", config.get("crawler", {}).get("request_delay_seconds", 1.5))),
        jitter=float(os.environ.get("JITTER_SECONDS", config.get("crawler", {}).get("jitter_seconds", 0.5))),
    )
    user_agent = os.environ.get("USER_AGENT", config.get("crawler", {}).get("user_agent"))
    max_retries = int(os.environ.get("MAX_RETRIES", config.get("crawler", {}).get("max_retries", 3)))

    storage = Storage(args.sqlite)
    robots = RobotsManager(robots_url="https://www.shl.com/robots.txt", user_agent=user_agent)
    robots.load()

    async def _runner():
        # Fetcher lifecycle is scoped to the backfill run.
        async with PlaywrightFetcher(user_agent=user_agent, rate_limiter=rate_limiter, max_retries=max_retries) as fetcher:
            await backfill_from_probe(args.probe_csv, storage, fetcher, robots, allow_bypass=args.allow_robots_bypass)

    asyncio.run(_runner())
    logger.info("backfill.labels.done")


if __name__ == "__main__":
    main()
|
crawler/export.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import structlog
|
| 9 |
+
|
| 10 |
+
from crawler.storage import Storage
|
| 11 |
+
from crawler.utils import now_iso
|
| 12 |
+
|
| 13 |
+
logger = structlog.get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _normalize_row(row) -> dict:
    """Normalize one assessment row (dict) from storage into the export schema.

    JSON-encoded fields are decoded; comma-joined fallbacks are split for
    job_levels/languages; duration is exposed in minutes and hours.
    """
    downloads = row["downloads"]
    if isinstance(downloads, str):
        try:
            downloads = json.loads(downloads)
        except Exception:
            # Unlike job_levels/languages, there is no string fallback here.
            downloads = None
    job_levels = row["job_levels"]
    if isinstance(job_levels, str):
        try:
            job_levels = json.loads(job_levels)
        except Exception:
            job_levels = [j.strip() for j in job_levels.split(",") if j.strip()]
    languages = row.get("languages")
    if isinstance(languages, str):
        try:
            languages = json.loads(languages)
        except Exception:
            languages = [l.strip() for l in languages.split(",") if l.strip()]
    duration_minutes = row["duration_minutes"]
    duration_hours = None
    if duration_minutes is not None:
        try:
            duration_hours = float(duration_minutes) / 60.0
        except Exception:
            duration_hours = None
    return {
        "url": row["url"],
        "name": row["name"],
        "description": row["description"],
        "test_type": row["test_type"],
        "test_type_full": row.get("test_type_full"),
        # Tri-state flags: None is preserved to distinguish "unknown".
        "remote_support": bool(row["remote_support"]) if row["remote_support"] is not None else None,
        "adaptive_support": bool(row["adaptive_support"]) if row["adaptive_support"] is not None else None,
        "duration": duration_minutes,
        "duration_hours": duration_hours,
        "job_levels": job_levels,
        "languages": languages,
        "downloads": downloads,
        "source": "shl_product_catalog",
        "crawled_at": now_iso(),
    }
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def export_catalog(
    storage: Storage,
    parquet_path: str,
    jsonl_path: Optional[str] = None,
    min_count: int = 377,
    limit: Optional[int] = None,
) -> None:
    """Export normalised assessment records to Parquet (and optionally JSONL).

    Fetches every stored assessment, validates that at least ``min_count``
    rows exist, normalises them, optionally truncates to ``limit`` rows,
    writes the output files, and logs a field-coverage summary.

    Raises:
        RuntimeError: when fewer than ``min_count`` assessments are stored.
    """
    rows = storage.fetch_assessments()
    logger.info("export.assessments.fetched", count=len(rows))

    if len(rows) < min_count:
        raise RuntimeError(f"Validation failed: expected at least {min_count} assessments, got {len(rows)}")

    normalized = [_normalize_row(dict(row)) for row in rows]
    df = pd.DataFrame.from_records(normalized)
    if limit:
        df = df.head(limit)
        logger.info("export.limit.applied", limit=limit, rows=len(df))

    Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(parquet_path, index=False)
    logger.info("export.parquet.write", path=parquet_path, rows=len(df))

    if jsonl_path:
        df.to_json(jsonl_path, orient="records", lines=True, force_ascii=False)
        logger.info("export.jsonl.write", path=jsonl_path, rows=len(df))

    # Coverage summary for quick eyeballing of crawl quality.
    logger.info(
        "export.summary",
        missing_description=df["description"].isna().sum(),
        missing_duration=df["duration"].isna().sum(),
        test_type_counts=df["test_type"].value_counts(dropna=False).to_dict(),
    )
|
crawler/fetcher.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
from playwright.async_api import async_playwright, Browser, Page
|
| 9 |
+
|
| 10 |
+
from crawler.storage import PageRecord
|
| 11 |
+
from crawler.utils import RateLimiter
|
| 12 |
+
|
| 13 |
+
logger = structlog.get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class FetchResult:
    """Outcome of a single page fetch attempt (success or failure)."""

    record: PageRecord  # crawl record to persist (carries url/page_type/status/html/error)
    status: Optional[int]  # HTTP status code; None when no response was received
    html: Optional[str]  # rendered page content; None when the fetch failed
    error: Optional[str]  # last error message; None on success
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class PlaywrightFetcher:
    """Thin wrapper around Playwright with polite rate limiting.

    Lifecycle: ``start()`` launches a single headless Chromium browser and one
    reusable page; ``fetch()`` navigates that page; ``close()`` tears
    everything down. Also usable as an async context manager.
    """

    def __init__(
        self,
        user_agent: str,
        rate_limiter: RateLimiter,
        max_retries: int = 3,
    ) -> None:
        # user_agent is applied to the browser context; rate_limiter.sleep()
        # is invoked before every navigation attempt; max_retries bounds the
        # number of goto attempts per URL.
        self.user_agent = user_agent
        self.rate_limiter = rate_limiter
        self.max_retries = max_retries
        self._playwright = None
        self._browser: Optional[Browser] = None
        self._page: Optional[Page] = None

    async def __aenter__(self) -> "PlaywrightFetcher":
        # Launch the browser on context entry.
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Always release browser resources, even on error.
        await self.close()

    async def start(self) -> None:
        """Launch headless Chromium and open the shared page (idempotent)."""
        if self._page:
            return
        self._playwright = await async_playwright().start()
        self._browser = await self._playwright.chromium.launch(headless=True)
        context = await self._browser.new_context(user_agent=self.user_agent)
        self._page = await context.new_page()
        logger.info("fetcher.started", user_agent=self.user_agent)

    async def close(self) -> None:
        """Shut down the browser and the Playwright driver; safe to call twice."""
        if self._browser:
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()
        self._browser = None
        self._page = None
        logger.info("fetcher.closed")

    async def fetch(self, url: str, page_type: str) -> FetchResult:
        """Fetch one URL with up to ``max_retries`` attempts.

        Never raises on navigation failure: after exhausting retries a
        FetchResult carrying the last error is returned instead.
        """
        assert self._page, "Fetcher must be started before fetch()"
        attempt = 0
        last_error: Optional[str] = None
        html: Optional[str] = None
        status: Optional[int] = None

        while attempt < self.max_retries:
            attempt += 1
            # NOTE(review): this call looks synchronous — if RateLimiter.sleep()
            # uses time.sleep it blocks the event loop for the whole delay;
            # confirm against crawler.utils.RateLimiter.
            self.rate_limiter.sleep()
            logger.info("fetcher.request", url=url, attempt=attempt)
            try:
                response = await self._page.goto(url, wait_until="networkidle", timeout=20000)
                # goto may return None (e.g. same-document navigation).
                status = response.status if response else None
                html = await self._page.content()
                return FetchResult(
                    record=PageRecord(url=url, page_type=page_type, http_status=status, html=html),
                    status=status,
                    html=html,
                    error=None,
                )
            except Exception as exc:  # pragma: no cover - network variability
                last_error = str(exc)
                logger.warning("fetcher.request.error", url=url, error=last_error, attempt=attempt)
        # All attempts failed: return a failure result with the last error.
        return FetchResult(
            record=PageRecord(url=url, page_type=page_type, http_status=status, html=html, error=last_error),
            status=status,
            html=html,
            error=last_error,
        )
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def fetch_sync(url: str, page_type: str, user_agent: str, rate_limiter: RateLimiter, max_retries: int = 3) -> FetchResult:
    """Blocking convenience wrapper: fetch a single URL with a throwaway fetcher."""

    async def _one_shot() -> FetchResult:
        fetcher = PlaywrightFetcher(
            user_agent=user_agent,
            rate_limiter=rate_limiter,
            max_retries=max_retries,
        )
        async with fetcher:
            return await fetcher.fetch(url, page_type)

    return asyncio.run(_one_shot())
|
crawler/parser_catalog.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
from urllib.parse import urljoin
|
| 5 |
+
|
| 6 |
+
import structlog
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
|
| 9 |
+
from crawler.storage import (
|
| 10 |
+
PAGE_TYPE_DETAIL,
|
| 11 |
+
PARSE_PARSED,
|
| 12 |
+
PageRecord,
|
| 13 |
+
Storage,
|
| 14 |
+
)
|
| 15 |
+
from crawler.utils import canonicalize_url, now_iso
|
| 16 |
+
|
| 17 |
+
logger = structlog.get_logger(__name__)
|
| 18 |
+
|
| 19 |
+
# Single-letter SHL test-type codes accepted from catalog badges.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
# Substrings (CSS color values / class fragments) that mark a green "yes" dot.
# NOTE(review): the rgb()/hex values appear copied from shl.com styles — verify
# against the live site if indicator detection misbehaves.
GREEN_TOKENS = ["green", "#8ac640", "rgb(138", "rgb(103", "0, 167, 83", "8ac640"]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _has_green_indicator(cell) -> bool:
    """Heuristically decide whether a table cell contains a green 'yes' marker.

    Checks inline styles, class names, SVG fill colors, and finally generic
    icon/dot shapes (for when the color is applied via external CSS).
    """
    for el in cell.find_all(True):
        style = (el.get("style") or "").lower()
        raw_class = el.get("class")
        if isinstance(raw_class, list):
            classes = " ".join(raw_class).lower()
        else:
            classes = str(raw_class or "").lower()
        fill = (el.get("fill") or "").lower()

        looks_positive = (
            any(tok in f"{style} {classes}" for tok in GREEN_TOKENS)
            or "-yes" in classes
            or "catalogue__circle" in classes
            or any(tok in fill for tok in GREEN_TOKENS)
            # Generic icon/dot detection (when color is applied via CSS).
            or el.name in {"svg", "circle", "path", "i"}
            or "dot" in classes
            or "indicator" in classes
        )
        if looks_positive:
            return True
    return False
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _entry_from_row(row):
    """Build one catalog entry dict from a table row, or None if it has no link."""
    link = row.find("a", href=True)
    if not link:
        return None

    badge_texts = [span.get_text("", strip=True) for span in row.find_all("span")]
    letters = [
        token.strip()
        for token in badge_texts
        if len(token.strip()) == 1 and token.strip() in ALLOWED_TEST_TYPES
    ]
    test_type = ",".join(dict.fromkeys(letters)) or None

    cells = row.find_all("td")
    if len(cells) >= 3:
        # Column layout: name | remote indicator | adaptive indicator.
        remote = _has_green_indicator(cells[1])
        adaptive = _has_green_indicator(cells[2])
    else:
        # Fallback for badge-style layouts without indicator columns.
        flat = " ".join(badge_texts).lower()
        remote = "remote" in flat
        adaptive = "adaptive" in flat or "irt" in flat

    return {
        "name": link.get_text(strip=True),
        "url": link["href"],
        "test_type": test_type or None,
        # False collapses to None (unknown vs. confirmed-no is not tracked here).
        "remote_support": remote if remote else None,
        "adaptive_support": adaptive if adaptive else None,
    }


def extract_catalog_entries(html: str) -> List[dict]:
    """Parse catalog page for individual test solutions.

    Intentionally defensive: selectors on shl.com may change, so any table
    whose headers mention "Individual Test Solutions" or "Assessment" is
    scanned for product rows.
    """
    soup = BeautifulSoup(html, "lxml")
    entries: List[dict] = []

    for table in soup.find_all("table"):
        header_text = " ".join(th.get_text(" ", strip=True) for th in table.find_all("th"))
        if "Individual Test Solutions" not in header_text and "Assessment" not in header_text:
            continue
        for row in table.find_all("tr"):
            entry = _entry_from_row(row)
            if entry:
                entries.append(entry)
    return entries
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def find_next_pages(html: str, source_url: str) -> List[str]:
    """Find pagination links ("Next" or numbered) and resolve to absolute URLs."""
    soup = BeautifulSoup(html, "lxml")
    candidates: List[str] = []
    for anchor in soup.find_all("a", href=True):
        label = anchor.get_text(" ", strip=True).lower()
        if "next" in label or label.isdigit():
            candidates.append(canonicalize_url(urljoin(source_url, anchor["href"])))
    # De-duplicate while preserving first-seen order.
    return list(dict.fromkeys(candidates))
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def parse_catalog_page(html: str, source_url: str, storage: Storage) -> Tuple[int, List[str], List[str]]:
    """Parse one catalog listing page and persist what it discovers.

    Upserts a detail-page record and a stub assessment row for every entry,
    marks the catalog page as parsed, and returns
    ``(entry_count, detail_urls, next_page_urls)``.
    """
    entries = extract_catalog_entries(html)
    discovered_urls: List[str] = []
    catalog_url = canonicalize_url(source_url)

    for entry in entries:
        detail_url = canonicalize_url(urljoin(source_url, entry["url"]))
        discovered_urls.append(detail_url)
        storage.upsert_page(PageRecord(url=detail_url, page_type=PAGE_TYPE_DETAIL))
        storage.upsert_assessment(
            {
                "url": detail_url,
                "name": entry.get("name"),
                "test_type": entry.get("test_type"),
                "remote_support": entry.get("remote_support"),
                "adaptive_support": entry.get("adaptive_support"),
                "source_catalog_page": catalog_url,
                "discovered_at": now_iso(),
            }
        )

    storage.update_parse_status(source_url, PARSE_PARSED)
    next_pages = find_next_pages(html, source_url)
    logger.info(
        "catalog.parse.summary",
        source_url=source_url,
        discovered=len(discovered_urls),
        next_pages=len(next_pages),
    )
    return len(entries), discovered_urls, next_pages
|
crawler/parser_detail.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, Iterable, List, Optional
|
| 5 |
+
|
| 6 |
+
import structlog
|
| 7 |
+
from bs4 import BeautifulSoup, Tag
|
| 8 |
+
|
| 9 |
+
from crawler.storage import PARSE_PARSED, Storage
|
| 10 |
+
from crawler.utils import canonicalize_url, now_iso
|
| 11 |
+
|
| 12 |
+
logger = structlog.get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
# Single-letter SHL test-type codes recognised on detail pages.
ALLOWED_TEST_TYPES = {"A", "B", "C", "D", "E", "K", "P", "S"}
# Section headings that terminate free-text extraction (see _extract_section_until).
STOP_LABELS = [
    "Job levels",
    "Job level",
    "Languages",
    "Language",
    "Assessment length",
    "Assessment Length",
    "Test Type",
    "Remote Testing",
    "Adaptive/IRT",
    "Adaptive",
    "Downloads",
]
# Lower-cased copy used for raw-text scanning (see _extract_segment).
STOP_LABELS_LOWER = [s.lower() for s in STOP_LABELS]
# Mapping from single-letter code to the full SHL test-type name.
TEST_TYPE_LABELS = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations",
}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _normalize(text: str) -> str:
|
| 42 |
+
return re.sub(r"\s+", " ", (text or "")).strip()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _extract_text(soup: BeautifulSoup, selector: str) -> Optional[str]:
    """Normalised text of the first element matching ``selector``, else None."""
    node = soup.select_one(selector)
    if not node:
        return None
    cleaned = _normalize(node.get_text(" ", strip=True))
    if not cleaned:
        return None
    return cleaned
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _find_label_node(soup: BeautifulSoup, label: str) -> Optional[Tag]:
    """Locate the element whose text best matches ``label``.

    Pass 1 prefers nodes whose text starts with the label (this subsumes the
    exact-match and "Label:" cases); pass 2 accepts any node containing the
    label as a whole word.
    """
    target = label.lower()
    candidates = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "span", "strong", "dt", "th", "li"])

    def _text_of(node: Tag) -> str:
        return _normalize(node.get_text(" ", strip=True)).lower()

    for node in candidates:
        if _text_of(node).startswith(target):
            return node

    word = re.compile(rf"\b{re.escape(target)}\b")
    for node in candidates:
        if word.search(_text_of(node)):
            return node
    return None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _extract_section_until(soup: BeautifulSoup, start_label: str, stop_labels: Iterable[str]) -> Optional[str]:
    """Collect the free text that follows ``start_label`` until a stop label.

    Walks the document in order from the label node, gathering paragraph-like
    text, and returns the joined text as soon as a node whose text begins with
    any stop label is reached. Returns None when the label is absent or no
    text was collected.
    """
    start = _find_label_node(soup, start_label)
    if not start:
        return None

    chunks: List[str] = []

    # If the label node itself reads "Label: some text", keep the trailing text.
    start_txt = _normalize(start.get_text(" ", strip=True))
    if re.match(rf"^{re.escape(start_label)}\s*:", start_txt, flags=re.I):
        after = re.split(rf"^{re.escape(start_label)}\s*:\s*", start_txt, flags=re.I)[-1]
        if after:
            chunks.append(after)

    # Document-order walk over everything after the label node.
    for node in start.find_all_next():
        if node == start:
            continue
        if not isinstance(node, Tag):
            continue

        node_txt = _normalize(node.get_text(" ", strip=True))
        if not node_txt:
            continue

        # A stop label marks the next section: return what we have so far.
        for stop in stop_labels:
            if re.match(rf"^{re.escape(stop)}\b", node_txt, flags=re.I):
                return _normalize(" ".join(chunks)) or None

        # Paragraph-like nodes always count; generic containers only when long
        # enough to look like body copy rather than UI chrome.
        if node.name in {"p", "li"}:
            chunks.append(node_txt)
        elif node.name in {"div", "span"} and len(node_txt) > 40:
            chunks.append(node_txt)

    return _normalize(" ".join(chunks)) or None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _extract_segment(text: str, label: str, stop_labels: Iterable[str]) -> Optional[str]:
    """Extract substring after a label up to the next stop label in raw text."""
    haystack = _normalize(text)
    lowered = haystack.lower()
    needle = label.lower()

    begin = lowered.find(needle)
    if begin == -1:
        return None
    begin += len(needle)
    # Skip separator characters directly after the label.
    while begin < len(haystack) and haystack[begin] in " :":
        begin += 1

    # Clip at the earliest stop label occurring after the start position.
    end = len(haystack)
    for stop in stop_labels:
        hit = lowered.find(stop, begin)
        if hit != -1 and hit < end:
            end = hit

    segment = haystack[begin:end].strip(" :-")
    return segment or None
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _extract_kv_value(soup: BeautifulSoup, label: str) -> Optional[str]:
    """Resolve the value of a labelled key/value field, trying several layouts.

    Lookup order: "Label: value" inside the label node, then remaining text
    after the label, then following sibling elements, then the parent's text
    minus the label, then the parent's siblings. Returns None when nothing
    usable is found.
    """
    node = _find_label_node(soup, label)
    if not node:
        return None

    txt = _normalize(node.get_text(" ", strip=True))
    # Case 1: "Label: value" inside the same element.
    m = re.match(rf"^{re.escape(label)}\s*:\s*(.+)$", txt, flags=re.I)
    if m:
        return m.group(1).strip() or None

    # Case 2: text remaining after removing the label prefix.
    remainder = re.sub(rf"^{re.escape(label)}\s*", "", txt, flags=re.I).strip(" :-")
    if remainder and remainder.lower() != label.lower():
        return remainder

    # Case 3: first non-empty following sibling element.
    for sib in node.next_siblings:
        if isinstance(sib, Tag):
            v = _normalize(sib.get_text(" ", strip=True))
            if v:
                return v

    # Case 4: the parent's text minus the label, then the parent's siblings.
    parent = node.parent if isinstance(node.parent, Tag) else None
    if parent:
        parent_txt = _normalize(parent.get_text(" ", strip=True))
        parent_remainder = re.sub(rf"\b{re.escape(label)}\b", "", parent_txt, flags=re.I).strip(" :-")
        if parent_remainder:
            return parent_remainder
        for sib in parent.find_next_siblings():
            v = _normalize(sib.get_text(" ", strip=True))
            if v:
                return v

    return None
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _extract_duration_minutes(soup: BeautifulSoup) -> Optional[int]:
    """Extract the assessment duration in minutes from a detail page.

    Scans the page's flattened text with patterns ordered from most to least
    specific and returns the first parsed integer, or None when no duration
    is stated.
    """
    text = _normalize(soup.get_text(" ", strip=True))
    # Fix: the "completion time" pattern used to come AFTER the generic
    # "<N> minutes" pattern, making it unreachable (any text it matches is
    # also matched by the generic form). Trying it earlier prefers the number
    # actually tied to completion time over a stray "<N> min" elsewhere.
    patterns = [
        r"minutes?\s*=\s*(\d+)",
        r"completion time.*?(\d+)\s*(?:minute|min)\b",
        r"(\d+)\s*(?:minute|min)\b",
    ]
    for pat in patterns:
        m = re.search(pat, text, flags=re.I)
        if not m:
            continue
        try:
            return int(m.group(1))
        except Exception:
            continue
    return None
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _extract_test_type_from_meta(soup: BeautifulSoup) -> Optional[str]:
    """Collect single-letter test-type codes near the "Test Type" label.

    Returns a comma-joined, order-preserving, de-duplicated code string
    (e.g. "A,K"), or None when no codes are found.
    """
    label = _find_label_node(soup, "Test Type")
    scope = label.parent if label and isinstance(label.parent, Tag) else label or soup

    def _letters(elements) -> List[str]:
        found: List[str] = []
        for el in elements:
            token = _normalize(el.get_text("", strip=True))
            if len(token) == 1 and token in ALLOWED_TEST_TYPES:
                found.append(token)
        return found

    # First look inside the label's container, then anywhere after the label.
    tokens = _letters(scope.find_all(["span", "button", "a"], limit=30))
    if not tokens and label:
        tokens = _letters(label.find_all_next(["span", "button", "a"], limit=30))
    if not tokens:
        return None
    return ",".join(dict.fromkeys(tokens))
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _map_test_types_full(test_type: Optional[str]) -> Optional[str]:
    """Expand a comma-separated code string (e.g. "A,K") into full type names.

    Unknown codes are silently dropped; returns None when nothing maps.
    """
    if not test_type:
        return None
    codes = (code.strip() for code in test_type.split(","))
    names = [TEST_TYPE_LABELS[code] for code in codes if code and code in TEST_TYPE_LABELS]
    return ", ".join(names) if names else None
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _split_list(value: Optional[str]) -> Optional[list[str]]:
|
| 213 |
+
if not value:
|
| 214 |
+
return None
|
| 215 |
+
parts = [p.strip() for p in value.replace(";", ",").split(",") if p.strip()]
|
| 216 |
+
return parts or None
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def _is_positive_indicator(node: Tag) -> bool:
|
| 220 |
+
if not node:
|
| 221 |
+
return False
|
| 222 |
+
attrs = " ".join(
|
| 223 |
+
[
|
| 224 |
+
" ".join(node.get("class", [])) if isinstance(node.get("class"), list) else str(node.get("class") or ""),
|
| 225 |
+
str(node.get("aria-label") or ""),
|
| 226 |
+
str(node.get("title") or ""),
|
| 227 |
+
str(node.get("style") or ""),
|
| 228 |
+
]
|
| 229 |
+
).lower()
|
| 230 |
+
positive_tokens = ["green", "yes", "true", "available", "supported", "active", "enabled", "tick", "check", "on"]
|
| 231 |
+
return any(tok in attrs for tok in positive_tokens)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _extract_boolean_from_meta(soup: BeautifulSoup, label_text: str) -> Optional[bool]:
    """Resolve a yes/no meta flag by scanning indicator elements near a label.

    Returns None when the label is absent, True on the first positive-looking
    indicator (first inside the label's parent, then anywhere after the label
    in document order), otherwise False.
    """
    label = _find_label_node(soup, label_text)
    if not label:
        return None

    container = label.parent if isinstance(label.parent, Tag) else label
    for candidate in container.find_all(["span", "i", "svg", "img"], limit=20):
        if _is_positive_indicator(candidate):
            return True

    for candidate in label.find_all_next(["span", "i", "svg", "img"], limit=20):
        if _is_positive_indicator(candidate):
            return True

    return False
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def extract_detail_fields(html: str) -> Dict:
    """Extract all structured fields from an SHL product detail page.

    Returns a dict with name, description, test_type (letter codes),
    test_type_full, remote/adaptive flags, duration_minutes, job_levels,
    languages and downloads; any field may be None when not found.
    """
    soup = BeautifulSoup(html, "lxml")

    # Title: prefer the H1, fall back to the <title> element.
    title = _extract_text(soup, "h1") or _extract_text(soup, "title")
    full_text = _normalize(soup.get_text(" ", strip=True))
    # Description: raw-text scan first, DOM section walk as a fallback.
    description = _extract_segment(full_text, "description", STOP_LABELS_LOWER)
    if not description:
        description = _extract_section_until(soup, "Description", STOP_LABELS)

    # List-like metadata: key/value DOM lookup first, raw-text scan second.
    job_levels_raw = _extract_kv_value(soup, "Job levels") or _extract_segment(full_text, "job levels", STOP_LABELS_LOWER)
    job_levels = _split_list(job_levels_raw)
    languages_raw = _extract_kv_value(soup, "Languages") or _extract_segment(full_text, "languages", STOP_LABELS_LOWER)
    languages = _split_list(languages_raw)

    # Duration: regex over the whole page, then the "Assessment length" segment.
    duration = _extract_duration_minutes(soup)
    if duration is None:
        segment = _extract_segment(full_text, "assessment length", STOP_LABELS_LOWER)
        if segment:
            match = re.search(r"(\d+)\s*(?:minute|min)", segment, flags=re.I)
            if match:
                try:
                    duration = int(match.group(1))
                except Exception:
                    duration = None

    test_type = _extract_test_type_from_meta(soup)
    test_type_full = _map_test_types_full(test_type)

    # Boolean meta flags; try progressively looser label variants for adaptive.
    remote_support = _extract_boolean_from_meta(soup, "Remote Testing")
    adaptive_support = _extract_boolean_from_meta(soup, "Adaptive/IRT")
    if adaptive_support is None:
        adaptive_support = _extract_boolean_from_meta(soup, "Adaptive")
    if adaptive_support is None:
        adaptive_support = _extract_boolean_from_meta(soup, "Adaptive/IRT Testing")

    # Download links: scoped to the "Downloads" block when present, else whole page.
    downloads = []
    downloads_label = _find_label_node(soup, "Downloads")
    scope = downloads_label.parent if downloads_label and isinstance(downloads_label.parent, Tag) else soup
    for link in scope.find_all("a", href=True):
        text = _normalize(link.get_text(" ", strip=True))
        href = link["href"]
        if text and any(keyword in text.lower() for keyword in ["report", "fact sheet", "sample", "pdf", "download", "brochure"]):
            downloads.append({"text": text, "url": href})

    return {
        "name": title,
        "description": description,
        "test_type": test_type,
        "test_type_full": test_type_full,
        "remote_support": remote_support,
        "adaptive_support": adaptive_support,
        "duration_minutes": duration,
        "job_levels": job_levels,
        "languages": languages,
        "downloads": downloads or None,
    }
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def parse_detail_page(html: str, url: str, storage: Storage) -> Dict:
    """Parse a detail page, persist the extracted fields, and return them."""
    fields = extract_detail_fields(html)
    payload = {
        "url": canonicalize_url(url),
        **fields,
        "last_updated_at": now_iso(),
    }
    storage.upsert_assessment(payload)
    storage.update_parse_status(url, PARSE_PARSED)
    logger.info("detail.parse.success", url=url, name=fields.get("name"))
    return fields
|
crawler/qa_checks.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any, Dict, Optional
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def load_catalog(path: str) -> pd.DataFrame:
    """Load an exported catalog (JSONL or Parquet) into a DataFrame.

    Raises FileNotFoundError when the path does not exist and ValueError for
    unsupported file extensions.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Catalog file not found: {path}")
    loaders = {
        ".jsonl": lambda: pd.read_json(path, lines=True),
        ".parquet": lambda: pd.read_parquet(path),
        ".pq": lambda: pd.read_parquet(path),
    }
    loader = loaders.get(p.suffix)
    if loader is None:
        raise ValueError(f"Unsupported catalog format: {path}")
    return loader()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def qa_checks(df: pd.DataFrame) -> Dict[str, Any]:
    """Run structural quality checks over the exported catalog frame.

    Reports row count, per-column missing percentages, URL uniqueness,
    minimum description length, test-type distribution, and whether the
    boolean columns hold only bool/int values.
    """
    total = len(df)

    def missing_pct(col: str) -> float:
        # A column that is absent entirely counts as 100% missing.
        if col not in df:
            return 100.0
        return float(df[col].isna().mean()) * 100.0

    bool_sanity: Dict[str, bool] = {}
    for col in ("remote_support", "adaptive_support"):
        if col not in df:
            bool_sanity[col] = False
            continue
        values = df[col].dropna()
        bool_sanity[col] = bool(values.apply(lambda v: isinstance(v, (bool, int))).all())

    if "description" in df:
        lengths = df["description"].dropna().apply(lambda v: len(str(v)))
    else:
        lengths = pd.Series(dtype=int)
    min_desc_len: Optional[int] = int(lengths.min()) if not lengths.empty else None

    duration_col = "duration" if "duration" in df else "duration_minutes"

    return {
        "total": total,
        "count_gate": total >= 377,
        "missing_pct": {
            "description": missing_pct("description"),
            "test_type": missing_pct("test_type"),
            "remote_support": missing_pct("remote_support"),
            "adaptive_support": missing_pct("adaptive_support"),
            "duration_minutes": missing_pct(duration_col),
        },
        "url_uniqueness": {
            "unique_urls": int(df["url"].nunique()) if "url" in df else 0,
            "matches_row_count": bool("url" in df and df["url"].nunique() == total),
        },
        "description_quality": {
            "min_length": min_desc_len,
            "passed_min_30": bool(min_desc_len is not None and min_desc_len >= 30),
        },
        "test_type_distribution": df["test_type"].value_counts(dropna=False).to_dict() if "test_type" in df else {},
        "boolean_sanity": bool_sanity,
    }
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main() -> None:
    """CLI entry point: load a catalog file and print QA results as JSON."""
    if len(sys.argv) < 2:
        print("Usage: python qa_checks.py <catalog.jsonl|catalog.parquet>")
        sys.exit(1)
    frame = load_catalog(sys.argv[1])
    print(json.dumps(qa_checks(frame), indent=2))


if __name__ == "__main__":
    main()
|
crawler/robots.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import urllib.robotparser
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
logger = structlog.get_logger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class RobotsManager:
    """Loads a site's robots.txt once and answers per-URL fetch-permission queries."""

    robots_url: str  # absolute URL of the robots.txt file
    user_agent: str  # UA string used for permission checks

    def __post_init__(self) -> None:
        # Parser is created eagerly; it answers conservatively until load() runs.
        self._parser = urllib.robotparser.RobotFileParser()

    def load(self) -> None:
        """Fetch and parse robots.txt; network failures are logged, not raised."""
        logger.info("robots.load.start", robots_url=self.robots_url)
        self._parser.set_url(self.robots_url)
        try:
            self._parser.read()
            logger.info("robots.load.success", can_fetch_all=self._parser.can_fetch(self.user_agent, "*"))
        except Exception as exc:  # pragma: no cover - network errors are logged
            logger.warning("robots.load.failed", error=str(exc))

    def is_allowed(self, url: str) -> bool:
        """Whether ``url`` may be fetched; fails closed (False) on parser errors."""
        try:
            return self._parser.can_fetch(self.user_agent, url)
        except Exception as exc:  # pragma: no cover
            logger.warning("robots.check.error", url=url, error=str(exc))
            return False
|
crawler/run.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import asyncio
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import structlog
|
| 10 |
+
|
| 11 |
+
from config import load_config
|
| 12 |
+
from crawler.export import export_catalog
|
| 13 |
+
from crawler.fetcher import PlaywrightFetcher
|
| 14 |
+
from crawler.parser_catalog import parse_catalog_page
|
| 15 |
+
from crawler.parser_detail import parse_detail_page
|
| 16 |
+
from crawler.robots import RobotsManager
|
| 17 |
+
from crawler.storage import (
|
| 18 |
+
PAGE_TYPE_CATALOG,
|
| 19 |
+
PAGE_TYPE_DETAIL,
|
| 20 |
+
PARSE_PENDING,
|
| 21 |
+
Storage,
|
| 22 |
+
)
|
| 23 |
+
from crawler.utils import RateLimiter
|
| 24 |
+
|
| 25 |
+
logger = structlog.get_logger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def configure_logging(log_level: str = "INFO") -> None:
    """Initialise stdlib logging and structlog JSON output at *log_level*.

    Unknown level names silently fall back to INFO.
    """
    level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(level=level)
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(level),
        processors=[
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer(),
        ],
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
async def crawl_catalog(
    start_url: str,
    storage: Storage,
    fetcher: PlaywrightFetcher,
    robots: RobotsManager,
    max_discover: int | None = None,
    allow_robots_bypass: bool = False,
) -> None:
    """Breadth-first crawl over catalog pages starting at *start_url*.

    Each fetched page is persisted via storage.upsert_page before parsing.
    Parsing yields detail URLs (counted toward *max_discover*) and next
    catalog pages (appended to the frontier). Fetch failures are logged
    and skipped so one bad page cannot stall the crawl.

    Args:
        start_url: First catalog page to fetch.
        storage: Page/assessment persistence layer.
        fetcher: Playwright-backed HTTP client.
        robots: robots.txt gatekeeper.
        max_discover: Stop after this many detail URLs are discovered (smoke tests).
        allow_robots_bypass: Fetch even robots-disallowed URLs (logged as bypassed).
    """
    frontier = [start_url]
    seen = set()
    total_discovered = 0

    while frontier:
        url = frontier.pop(0)
        if url in seen:
            continue
        seen.add(url)
        if not robots.is_allowed(url):
            if not allow_robots_bypass:
                logger.warning("catalog.fetch.disallowed", url=url)
                continue
            # Fix: only log a bypass for URLs robots.txt actually disallows.
            # Previously every URL was logged as "bypassed" whenever the flag
            # was set, drowning the log in false positives.
            logger.warning("catalog.fetch.disallowed.bypassed", url=url)
        result = await fetcher.fetch(url, page_type=PAGE_TYPE_CATALOG)
        storage.upsert_page(result.record)
        if result.error or not result.html:
            logger.error("catalog.fetch.failed", url=url, error=result.error)
            continue
        _, discovered_urls, next_pages = parse_catalog_page(result.html, source_url=url, storage=storage)
        total_discovered += len(discovered_urls)
        for next_url in next_pages:
            if next_url not in seen:
                frontier.append(next_url)
        if max_discover and total_discovered >= max_discover:
            logger.info("catalog.max_discover.reached", total=total_discovered, max=max_discover)
            break
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
async def crawl_details(
    storage: Storage,
    fetcher: PlaywrightFetcher,
    robots: RobotsManager,
    allow_robots_bypass: bool = False,
) -> None:
    """Fetch and parse every detail page still marked PENDING in storage.

    Each fetch result is persisted before parsing; fetch failures are logged
    and skipped so one bad page cannot stall the queue.

    Args:
        storage: Page/assessment persistence layer (source of the pending queue).
        fetcher: Playwright-backed HTTP client.
        robots: robots.txt gatekeeper.
        allow_robots_bypass: Fetch even robots-disallowed URLs (logged as bypassed).
    """
    pending = storage.get_pages_by_type(PAGE_TYPE_DETAIL, parse_status=PARSE_PENDING)
    logger.info("detail.queue", pending=len(pending))
    for page in pending:
        url = page["url"]
        if not robots.is_allowed(url):
            if not allow_robots_bypass:
                logger.warning("detail.fetch.disallowed", url=url)
                continue
            # Fix: only emit the "bypassed" warning for URLs robots.txt actually
            # disallows (previously it fired for every URL when the flag was set).
            logger.warning("detail.fetch.disallowed.bypassed", url=url)
        result = await fetcher.fetch(url, page_type=PAGE_TYPE_DETAIL)
        storage.upsert_page(result.record)
        if result.error or not result.html:
            logger.error("detail.fetch.failed", url=url, error=result.error)
            continue
        parse_detail_page(result.html, url=url, storage=storage)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def main(argv: Optional[list[str]] = None) -> None:
    """CLI entry point for the crawler pipeline.

    Modes:
      * crawl_all - discover catalog pages, then fetch/parse detail pages
      * discover  - catalog discovery only
      * details   - fetch/parse pending detail pages only
      * export    - write the assessments table to parquet/jsonl

    Environment variables (REQUEST_DELAY_SECONDS, JITTER_SECONDS, USER_AGENT,
    START_URL, MAX_RETRIES, ALLOW_ROBOTS_BYPASS, CONFIG_PATH) take precedence
    over the YAML config values.
    """
    parser = argparse.ArgumentParser(description="Crawler pipeline")
    parser.add_argument("--mode", choices=["crawl_all", "discover", "details", "export"], default="crawl_all")
    parser.add_argument("--config", type=str, default=os.environ.get("CONFIG_PATH", "configs/config.yaml"))
    parser.add_argument("--parquet", type=str, default="data/catalog.parquet")
    parser.add_argument("--jsonl", type=str, default="data/catalog.jsonl")
    parser.add_argument(
        "--max-discover",
        type=int,
        default=None,
        help="Limit number of detail URLs discovered (for smoke tests)",
    )
    parser.add_argument(
        "--limit-export",
        type=int,
        default=None,
        help="Limit number of rows exported (for smoke tests)",
    )
    parser.add_argument(
        "--allow-robots-bypass",
        action="store_true",
        help="Bypass robots.txt disallow (for testing; use responsibly)",
    )
    args = parser.parse_args(argv)

    config = load_config(args.config)
    configure_logging(config.get("app", {}).get("log_level", "INFO"))
    crawler_cfg = config.get("crawler", {})
    # Env vars win over config values so deployments can tune crawl politeness
    # without editing the YAML file.
    rate_limiter = RateLimiter(
        base_delay=float(os.environ.get("REQUEST_DELAY_SECONDS", crawler_cfg.get("request_delay_seconds", 1.5))),
        jitter=float(os.environ.get("JITTER_SECONDS", crawler_cfg.get("jitter_seconds", 0.5))),
    )
    user_agent = os.environ.get("USER_AGENT", crawler_cfg.get("user_agent"))
    start_url = os.environ.get("START_URL", crawler_cfg.get("start_url"))
    max_retries = int(os.environ.get("MAX_RETRIES", crawler_cfg.get("max_retries", 3)))
    sqlite_path = crawler_cfg.get("sqlite_path", "data/crawler.db")
    allow_bypass = args.allow_robots_bypass or os.environ.get("ALLOW_ROBOTS_BYPASS", "").lower() in {"1", "true", "yes"}

    storage = Storage(sqlite_path)
    robots = RobotsManager(robots_url="https://www.shl.com/robots.txt", user_agent=user_agent)
    robots.load()

    async def _runner():
        # One browser/fetcher session is shared across both crawl phases.
        async with PlaywrightFetcher(user_agent=user_agent, rate_limiter=rate_limiter, max_retries=max_retries) as fetcher:
            if args.mode in {"crawl_all", "discover"}:
                await crawl_catalog(start_url, storage, fetcher, robots, max_discover=args.max_discover, allow_robots_bypass=allow_bypass)
            if args.mode in {"crawl_all", "details"}:
                await crawl_details(storage, fetcher, robots, allow_robots_bypass=allow_bypass)

    if args.mode in {"crawl_all", "discover", "details"}:
        asyncio.run(_runner())

    if args.mode == "export":
        export_catalog(
            storage,
            parquet_path=args.parquet,
            jsonl_path=args.jsonl,
            limit=args.limit_export,
            # 377 is presumably the expected full-catalog size — TODO confirm.
            # Any --limit-export (smoke test) relaxes the minimum to 1.
            min_count=1 if args.limit_export else 377,
        )


if __name__ == "__main__":
    main()
|
crawler/storage.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import sqlite3
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Dict, Iterable, List, Optional
|
| 9 |
+
|
| 10 |
+
import structlog
|
| 11 |
+
|
| 12 |
+
from crawler.utils import canonicalize_url, make_assessment_id, now_iso
|
| 13 |
+
|
| 14 |
+
logger = structlog.get_logger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
PAGE_TYPE_CATALOG = "CATALOG"
|
| 18 |
+
PAGE_TYPE_DETAIL = "DETAIL"
|
| 19 |
+
|
| 20 |
+
PARSE_PENDING = "PENDING"
|
| 21 |
+
PARSE_PARSED = "PARSED"
|
| 22 |
+
PARSE_FAILED = "FAILED"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class PageRecord:
    """One fetched page, as persisted to the `pages` table by Storage.upsert_page."""

    url: str  # exact URL fetched; primary key of the pages table
    page_type: str  # PAGE_TYPE_CATALOG or PAGE_TYPE_DETAIL
    http_status: Optional[int] = None  # None when the fetch never got a response
    html: Optional[str] = None  # raw page body; None on fetch failure
    error: Optional[str] = None  # fetch error description, if any
    retry_count: int = 0  # how many retries the fetcher used for this page
    parse_status: str = PARSE_PENDING  # PENDING / PARSED / FAILED lifecycle flag
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Storage:
    """SQLite-backed persistence for crawled pages and parsed assessments.

    Owns a single connection (no locking — not safe for concurrent writers;
    TODO confirm single-threaded use). The schema is created on construction
    if missing. Call close() when finished.
    """

    def __init__(self, db_path: str) -> None:
        self.db_path = db_path
        # Ensure the parent directory exists so sqlite3 can create the file.
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row  # rows support name access: row["url"]
        self.ensure_schema()

    def ensure_schema(self) -> None:
        """Create the pages / assessments / crawl_meta tables if they do not exist."""
        logger.info("storage.schema.ensure", db_path=self.db_path)
        cur = self.conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS pages (
                url TEXT PRIMARY KEY,
                url_canonical TEXT UNIQUE,
                page_type TEXT,
                http_status INTEGER,
                fetched_at TEXT,
                html TEXT,
                error TEXT,
                retry_count INTEGER DEFAULT 0,
                parse_status TEXT DEFAULT 'PENDING'
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS assessments (
                assessment_id TEXT PRIMARY KEY,
                url TEXT UNIQUE,
                name TEXT,
                description TEXT,
                test_type TEXT,
                test_type_full TEXT,
                remote_support INTEGER,
                adaptive_support INTEGER,
                duration_minutes INTEGER,
                job_levels TEXT,
                languages TEXT,
                downloads TEXT,
                source_catalog_page TEXT,
                discovered_at TEXT,
                last_updated_at TEXT
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS crawl_meta (
                run_id TEXT,
                started_at TEXT,
                finished_at TEXT,
                total_catalog_pages INTEGER,
                total_detail_pages INTEGER,
                individual_assessment_count INTEGER,
                notes TEXT
            )
            """
        )
        self.conn.commit()

    def upsert_page(self, record: PageRecord) -> None:
        """Insert or fully overwrite the row for record.url.

        Unlike upsert_assessment, this replaces every column on conflict —
        re-fetching a page clobbers the previous html/error/status.
        fetched_at is always set to the current UTC time.
        """
        canonical = canonicalize_url(record.url)
        logger.debug("storage.page.upsert", url=record.url, page_type=record.page_type)
        self.conn.execute(
            """
            INSERT INTO pages (url, url_canonical, page_type, http_status, fetched_at, html, error, retry_count, parse_status)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                page_type=excluded.page_type,
                http_status=excluded.http_status,
                fetched_at=excluded.fetched_at,
                html=excluded.html,
                error=excluded.error,
                retry_count=excluded.retry_count,
                parse_status=excluded.parse_status
            """,
            (
                record.url,
                canonical,
                record.page_type,
                record.http_status,
                now_iso(),
                record.html,
                record.error,
                record.retry_count,
                record.parse_status,
            ),
        )
        self.conn.commit()

    def update_parse_status(self, url: str, status: str) -> None:
        """Set pages.parse_status for *url* (PENDING / PARSED / FAILED)."""
        self.conn.execute("UPDATE pages SET parse_status=? WHERE url=?", (status, url))
        self.conn.commit()

    def get_pages_by_type(self, page_type: str, parse_status: Optional[str] = None) -> List[sqlite3.Row]:
        """Return pages of *page_type*, optionally filtered by parse_status, ordered by url."""
        cur = self.conn.cursor()
        if parse_status:
            cur.execute(
                "SELECT * FROM pages WHERE page_type=? AND parse_status=? ORDER BY url", (page_type, parse_status)
            )
        else:
            cur.execute("SELECT * FROM pages WHERE page_type=? ORDER BY url", (page_type,))
        return cur.fetchall()

    def upsert_assessment(self, data: Dict[str, Any]) -> None:
        """Insert or merge one assessment row keyed by assessment_id.

        Requires data["url"]; assessment_id is derived from the URL when absent.
        List-valued fields (job_levels, languages, downloads) are JSON-encoded.
        On conflict, existing column values are only overwritten when the new
        value is non-NULL (COALESCE merge), so partial re-parses never erase
        previously captured fields; last_updated_at is always refreshed.
        """
        url = data["url"]
        assessment_id = data.get("assessment_id") or make_assessment_id(url)
        data = {**data, "assessment_id": assessment_id}
        downloads = data.get("downloads")
        if downloads is not None and not isinstance(downloads, str):
            downloads = json.dumps(downloads)
        job_levels = data.get("job_levels")
        if isinstance(job_levels, (list, tuple)):
            job_levels = json.dumps(job_levels)
        languages = data.get("languages")
        if isinstance(languages, (list, tuple)):
            languages = json.dumps(languages)

        logger.debug("storage.assessment.upsert", url=url)
        self.conn.execute(
            """
            INSERT INTO assessments (
                assessment_id, url, name, description, test_type, test_type_full, remote_support, adaptive_support,
                duration_minutes, job_levels, languages, downloads, source_catalog_page, discovered_at, last_updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(assessment_id) DO UPDATE SET
                url=excluded.url,
                name=COALESCE(excluded.name, assessments.name),
                description=COALESCE(excluded.description, assessments.description),
                test_type=COALESCE(excluded.test_type, assessments.test_type),
                test_type_full=COALESCE(excluded.test_type_full, assessments.test_type_full),
                remote_support=COALESCE(excluded.remote_support, assessments.remote_support),
                adaptive_support=COALESCE(excluded.adaptive_support, assessments.adaptive_support),
                duration_minutes=COALESCE(excluded.duration_minutes, assessments.duration_minutes),
                job_levels=COALESCE(excluded.job_levels, assessments.job_levels),
                languages=COALESCE(excluded.languages, assessments.languages),
                downloads=COALESCE(excluded.downloads, assessments.downloads),
                source_catalog_page=COALESCE(excluded.source_catalog_page, assessments.source_catalog_page),
                last_updated_at=excluded.last_updated_at
            """,
            (
                data["assessment_id"],
                url,
                data.get("name"),
                data.get("description"),
                data.get("test_type"),
                data.get("test_type_full"),
                data.get("remote_support"),
                data.get("adaptive_support"),
                data.get("duration_minutes"),
                job_levels,
                languages,
                downloads,
                data.get("source_catalog_page"),
                data.get("discovered_at") or now_iso(),
                data.get("last_updated_at") or now_iso(),
            ),
        )
        self.conn.commit()

    def fetch_assessments(self) -> List[sqlite3.Row]:
        """Return all assessment rows ordered by name."""
        cur = self.conn.cursor()
        cur.execute("SELECT * FROM assessments ORDER BY name")
        return cur.fetchall()

    def count_assessments(self) -> int:
        """Return the number of rows in the assessments table."""
        cur = self.conn.cursor()
        cur.execute("SELECT COUNT(*) FROM assessments")
        return cur.fetchone()[0]

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()
|
crawler/utils.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import random
|
| 5 |
+
import time
|
| 6 |
+
import urllib.parse
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from typing import Iterable
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def canonicalize_url(url: str) -> str:
    """Normalize *url*: drop the fragment, strip utm_* tracking params, and
    collapse a bare "/" path so "http://x/" and "http://x" canonicalize alike.

    The path is otherwise preserved exactly as given — including any trailing
    slash — because detail pages 404 when their trailing slash is removed.
    (The original contained a dead no-op branch `path = path` here; removed.)

    Returns the normalized URL, or the input unchanged if normalization
    produced an empty string.
    """
    parsed = urllib.parse.urlparse(url)
    query = urllib.parse.parse_qsl(parsed.query, keep_blank_values=True)
    filtered_query = [(k, v) for k, v in query if not k.lower().startswith("utm_")]
    cleaned_query = urllib.parse.urlencode(filtered_query, doseq=True)
    path = parsed.path if parsed.path != "/" else ""
    normalized = parsed._replace(query=cleaned_query, fragment="", path=path).geturl()
    return normalized or url


def make_assessment_id(url: str) -> str:
    """Deterministic SHA-1 hex ID derived from the canonical form of *url*."""
    canonical = canonicalize_url(url)
    return hashlib.sha1(canonical.encode("utf-8")).hexdigest()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def now_iso() -> str:
    """Current UTC time as an ISO-8601 timestamp string."""
    current = datetime.now(timezone.utc)
    return current.isoformat()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class RateLimiter:
    """Enforces a minimum, jittered delay between successive sleep() calls.

    Used for polite crawling: each call blocks until at least
    base_delay + uniform(0, jitter) seconds have elapsed since the last call.
    """

    def __init__(self, base_delay: float, jitter: float) -> None:
        self.base_delay = base_delay
        self.jitter = jitter
        self._last_ts = 0.0  # monotonic timestamp of the previous sleep()

    def sleep(self) -> None:
        """Block for whatever remains of the jittered delay window, then stamp the clock."""
        target = self.base_delay + random.uniform(0, self.jitter)
        waited = time.monotonic() - self._last_ts
        if waited < target:
            time.sleep(target - waited)
        self._last_ts = time.monotonic()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def batched(iterable: Iterable, size: int):
    """Yield lists of at most *size* consecutive items; the last batch may be shorter."""
    bucket = []
    for element in iterable:
        bucket.append(element)
        if len(bucket) == size:
            yield bucket
            bucket = []
    if bucket:
        yield bucket
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.9"

services:
  # Backend API container, built from the repo root Dockerfile.
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      LLM_MODEL: Qwen/Qwen2.5-1.5B-Instruct
      HF_HOME: /cache/hf  # Hugging Face cache lives on the named volume below
    volumes:
      - ./data:/app/data:ro      # catalog / embeddings, mounted read-only
      - ./models:/app/models:ro  # local model weights, mounted read-only
      - hf-cache:/cache/hf

  # Frontend container; reaches the API by service name on the compose network.
  web:
    build: ./frontend
    ports:
      - "3000:3000"
    environment:
      NEXT_PUBLIC_API_BASE: http://api:8000
    depends_on:
      - api

volumes:
  hf-cache:  # persisted HF download cache, survives container rebuilds
|
embeddings/generator.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List, Tuple
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
from data.catalog_loader import make_assessment_id
|
| 13 |
+
from models.embedding_model import EmbeddingModel
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def generate_embeddings(catalog_path: str, model_name: str, batch_size: int = 32, output_dir: str = "data/embeddings") -> Tuple[np.ndarray, List[str]]:
    """Embed every catalog document and persist embeddings, IDs, and a run log.

    Reads the `doc_text` column from a .jsonl or .parquet catalog, sorts rows
    by assessment_id so output row order is stable, and writes to *output_dir*:
      * embeddings.npy       - float32 matrix, one normalized row per document
      * assessment_ids.json  - list of IDs aligned row-for-row with the matrix
      * generation_log.json  - model / batch / timing metadata

    Returns (embeddings array, assessment_id list).
    NOTE(review): assumes a non-empty catalog — np.vstack raises on zero rows.
    """
    df = pd.read_json(catalog_path, lines=True) if catalog_path.endswith(".jsonl") else pd.read_parquet(catalog_path)
    if "assessment_id" not in df.columns:
        # Derive stable IDs from URLs when the catalog predates the id column.
        if "url" in df.columns:
            df["assessment_id"] = df["url"].apply(make_assessment_id)
        else:
            raise KeyError("assessment_id not found and url missing to derive it.")
    df = df.sort_values("assessment_id")  # keeps .npy rows aligned with the id list
    texts = df["doc_text"].tolist()
    ids = df["assessment_id"].tolist()

    model = EmbeddingModel(model_name)
    embeddings: List[np.ndarray] = []
    start = time.time()
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i : i + batch_size]
        embeds = model.encode(batch, normalize=True, batch_size=batch_size, is_query=False)
        embeddings.append(embeds)
    embeddings_arr = np.vstack(embeddings).astype(np.float32)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    np.save(Path(output_dir) / "embeddings.npy", embeddings_arr)
    with open(Path(output_dir) / "assessment_ids.json", "w") as f:
        json.dump(ids, f, indent=2)

    total_time = time.time() - start
    log = {
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "model_name": model_name,
        "num_documents": len(texts),
        "embedding_dim": embeddings_arr.shape[1],
        "batch_size": batch_size,
        "total_time_seconds": total_time,
        "avg_time_per_doc_ms": (total_time / len(texts) * 1000) if len(texts) else None,
        "normalized": True,
        "catalog_path": catalog_path,
    }
    with open(Path(output_dir) / "generation_log.json", "w") as f:
        json.dump(log, f, indent=2)
    return embeddings_arr, ids
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
    import argparse

    # CLI wrapper: embed an enriched catalog (must contain a doc_text column).
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", required=True, help="Enriched catalog with doc_text")
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--output-dir", default="data/embeddings")
    args = parser.parse_args()

    generate_embeddings(args.catalog, args.model, batch_size=args.batch_size, output_dir=args.output_dir)
|
eval/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation datasets, metrics, and experiments."""
|
eval/compare_runs.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def load_metrics(path: str) -> dict:
    """Read a metrics JSON file and return its parsed contents."""
    with open(path) as handle:
        return json.load(handle)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def compare(run_a: str, run_b: str) -> dict:
    """Load metrics.json from two run directories and pair their headline metrics."""
    metrics_a = load_metrics(Path(run_a) / "metrics.json")
    metrics_b = load_metrics(Path(run_b) / "metrics.json")

    def extract(metrics: dict) -> dict:
        # Only the headline retrieval numbers are compared.
        return {
            "train_r10": metrics["train"]["recall@10"],
            "val_r10": metrics["val"]["recall@10"],
            "val_mrr10": metrics["val"]["mrr@10"],
        }

    return {
        "run_a": run_a,
        "run_b": run_b,
        "metrics_a": extract(metrics_a),
        "metrics_b": extract(metrics_b),
    }
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main():
    """CLI: print a JSON comparison of two evaluation run directories."""
    if len(sys.argv) != 3:
        print("Usage: python -m eval.compare_runs <run_dir_a> <run_dir_b>")
        sys.exit(1)
    comparison = compare(sys.argv[1], sys.argv[2])
    print(json.dumps(comparison, indent=2))


if __name__ == "__main__":
    main()
|
eval/diagnostic_topk.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from data.catalog_loader import load_catalog
|
| 10 |
+
from data.train_loader import load_train
|
| 11 |
+
from recommenders.bm25 import BM25Recommender
|
| 12 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 13 |
+
from recommenders.hybrid_rrf import HybridRRFRecommender, HybridRerankRecommender
|
| 14 |
+
from retrieval.vector_index import VectorIndex
|
| 15 |
+
from models.embedding_model import EmbeddingModel
|
| 16 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def main():
    """Diagnose retrieval coverage: how many labelled positives survive into the
    top-N hybrid candidate pool and into the reranked top-10.

    Builds the BM25 + vector stack fused via RRF (optionally cross-encoder
    reranked), runs every training query through it, and writes a per-query
    JSONL, a label-resolution report, and a summary JSON to --output-dir,
    printing the summary at the end.
    """
    parser = argparse.ArgumentParser(description="Diagnostics: positives coverage in top-N candidates and top-10 rerank.")
    parser.add_argument("--catalog", default="data/catalog_docs.jsonl")
    parser.add_argument("--train", required=True)
    parser.add_argument("--vector-index", required=True)
    parser.add_argument("--assessment-ids", required=True)
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--reranker-model", default="cross-encoder/ms-marco-MiniLM-L-6-v2")
    parser.add_argument("--topn", type=int, default=200, help="Top-N candidates to inspect")
    parser.add_argument("--rrf-k", type=int, default=60)
    parser.add_argument("--output-dir", default="runs/diagnostic_topk")
    args = parser.parse_args()

    # Assemble the retrieval stack.
    df_catalog, _, id_by_url = load_catalog(args.catalog)
    with open(args.assessment_ids) as f:
        ids = json.load(f)
    index = VectorIndex.load(args.vector_index)
    embed_model = EmbeddingModel(args.model)
    vector_rec = VectorRecommender(embed_model, index, df_catalog, ids, k_candidates=args.topn)
    bm25_rec = BM25Recommender(df_catalog)
    hybrid = HybridRRFRecommender(bm25_rec, vector_rec, topn_candidates=args.topn, rrf_k=args.rrf_k)
    reranker = CrossEncoderReranker(model_name=args.reranker_model)
    hybrid_rerank = HybridRerankRecommender(bm25_rec, vector_rec, reranker, df_catalog, topn_candidates=args.topn, rrf_k=args.rrf_k)

    examples, label_report = load_train(args.train, id_by_url)
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    Path(args.output_dir, "label_resolution_report.json").write_text(json.dumps(label_report, indent=2))

    rows = []
    coverage_fail = 0
    zero_topn = 0
    zero_top10 = 0
    for ex in examples:
        candidates = hybrid.recommend(ex.query, k=args.topn)
        reranked = hybrid_rerank.recommend(ex.query, k=10)
        pos_topn = len(set(candidates).intersection(ex.relevant_ids))
        pos_top10 = len(set(reranked).intersection(ex.relevant_ids))
        if pos_topn == 0:
            # Fix: the original tested `pos_topn == 0` twice in two identical
            # conditionals (zero_topn and coverage_fail count the same event);
            # merged into one branch with unchanged output values.
            zero_topn += 1
            coverage_fail += 1
        if pos_top10 == 0:
            zero_top10 += 1
        rows.append(
            {
                "query": ex.query,
                "relevant_ids": list(ex.relevant_ids),
                "pos_in_topn": pos_topn,
                "pos_in_top10": pos_top10,
                "candidates": candidates,
                "reranked_top10": reranked,
            }
        )

    summary = {
        "total_queries": len(examples),
        "topn": args.topn,
        "zero_pos_in_topn": zero_topn,
        "zero_pos_in_top10": zero_top10,
        "coverage_failures": coverage_fail,
        "label_match_pct": label_report.get("matched_pct"),
    }
    with open(Path(args.output_dir) / "summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    pd.DataFrame(rows).to_json(Path(args.output_dir) / "per_query.jsonl", orient="records", lines=True)
    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
|
eval/metrics.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, List, Sequence, Set
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def recall_at_k(ground_truth: Set[str], preds: Sequence[str], k: int) -> float:
|
| 7 |
+
if not ground_truth:
|
| 8 |
+
return 0.0
|
| 9 |
+
topk = preds[:k]
|
| 10 |
+
hits = len(ground_truth.intersection(topk))
|
| 11 |
+
return hits / len(ground_truth)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def mrr_at_k(ground_truth: Set[str], preds: Sequence[str], k: int) -> float:
    """Reciprocal rank of the first relevant prediction within the top ``k``.

    Returns 0.0 when there is no ground truth or no relevant prediction
    appears in the top ``k``.
    """
    if not ground_truth:
        return 0.0
    reciprocal_ranks = (
        1.0 / rank
        for rank, pid in enumerate(preds[:k], start=1)
        if pid in ground_truth
    )
    return next(reciprocal_ranks, 0.0)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def mean_metric(queries: Iterable[Set[str]], preds_list: Iterable[Sequence[str]], fn, k: int) -> float:
    """Average a per-query metric ``fn(ground_truth, preds, k)`` over all pairs.

    Pairs are formed positionally from ``queries`` and ``preds_list``;
    returns 0.0 when there are no pairs.
    """
    scores = [fn(gt, preds, k) for gt, preds in zip(queries, preds_list)]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)
|
eval/run_eval.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, List
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from data.catalog_loader import load_catalog
|
| 13 |
+
from data.train_loader import load_train, save_label_resolution_report
|
| 14 |
+
from eval.metrics import recall_at_k, mrr_at_k
|
| 15 |
+
from recommenders.dummy_random import DummyRandomRecommender
|
| 16 |
+
from recommenders.bm25 import BM25Recommender
|
| 17 |
+
from recommenders.vector_recommender import VectorRecommender
|
| 18 |
+
from recommenders.hybrid_rrf import HybridRRFRecommender, HybridRerankRecommender
|
| 19 |
+
from recommenders.hybrid_rrf_lgbm import HybridRRFLGBMRecommender
|
| 20 |
+
from retrieval.vector_index import VectorIndex
|
| 21 |
+
from models.embedding_model import EmbeddingModel
|
| 22 |
+
from rerankers.cross_encoder import CrossEncoderReranker
|
| 23 |
+
from rerankers.lgbm_reranker import LGBMReranker
|
| 24 |
+
from retrieval.query_rewriter import rewrite_query
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def split_examples(examples, val_ratio=0.2, seed=42):
    """Deterministically shuffle ``examples`` and split into (train, val).

    The input list is not mutated; the same ``seed`` always produces the
    same partition. The validation set receives roughly ``val_ratio`` of
    the examples (the remainder after an integer-floored train cut).
    """
    import random

    rng = random.Random(seed)
    pool = list(examples)
    rng.shuffle(pool)
    boundary = int(len(pool) * (1 - val_ratio))
    return pool[:boundary], pool[boundary:]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_eval(catalog_path: str, train_path: str, recommender_name: str, out_dir: str, seed: int = 42):
    """Evaluate a simple (non-vector) recommender on an 80/20 train/val split.

    Loads the catalog and labeled queries, splits them deterministically,
    scores the chosen recommender with recall@5/recall@10 and MRR@10, and
    writes metrics plus per-query diagnostics into ``out_dir``.

    Args:
        catalog_path: Path to the assessment catalog (JSONL).
        train_path: Path to the labeled training queries.
        recommender_name: "dummy_random" or "bm25". "vector" is rejected
            here because it needs an index/ids and is wired up in main().
        out_dir: Output directory for reports; created if missing.
        seed: Seed for the deterministic train/val shuffle.

    Raises:
        RuntimeError: If "vector" is requested.
        ValueError: If the recommender name is unknown.
    """
    df_catalog, catalog_by_id, id_by_url = load_catalog(catalog_path)
    examples, label_report = load_train(train_path, id_by_url)
    # BUGFIX: create out_dir before the first write. Previously the directory
    # was only created after evaluation, so saving the label report below
    # could fail on a fresh run directory.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    save_label_resolution_report(label_report, Path(out_dir) / "label_resolution_report.json")

    train_split, val_split = split_examples(examples, val_ratio=0.2, seed=seed)

    def make_recommender():
        # Map the CLI recommender name to a concrete implementation.
        if recommender_name == "dummy_random":
            return DummyRandomRecommender(df_catalog["assessment_id"].tolist(), seed=seed)
        if recommender_name == "bm25":
            return BM25Recommender(df_catalog)
        if recommender_name == "vector":
            # Expect doc_text present in df_catalog and provided index/ids/model via env/args; set below in main()
            raise RuntimeError("Vector recommender should be constructed in main with index and ids.")
        raise ValueError(f"Unknown recommender: {recommender_name}")

    recommender = make_recommender()

    def eval_split(split, split_name):
        # Score one split; returns (recall@10, recall@5, mrr@10, per-query rows).
        preds_list: List[List[str]] = []
        gt_list: List[set] = []
        rows = []
        for ex in split:
            preds_raw = recommender.recommend(ex.query, k=10)
            # Recommenders may return plain ids or dicts carrying an
            # "assessment_id" key; normalize to a flat id list.
            preds = []
            for pr in preds_raw:
                if isinstance(pr, str):
                    preds.append(pr)
                elif isinstance(pr, dict) and "assessment_id" in pr:
                    preds.append(pr["assessment_id"])
            preds = preds[:10]
            preds_list.append(preds)
            gt_list.append(ex.relevant_ids)
            hits = len(set(preds).intersection(ex.relevant_ids))
            rows.append(
                {
                    "query": ex.query,
                    "relevant_ids": list(ex.relevant_ids),
                    "predicted_ids": preds,
                    "hits": hits,
                }
            )
        recall10 = sum(recall_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        recall5 = sum(recall_at_k(g, p, 5) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        mrr10 = sum(mrr_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
        return recall10, recall5, mrr10, rows

    train_r10, train_r5, train_mrr10, train_rows = eval_split(train_split, "train")
    val_r10, val_r5, val_mrr10, val_rows = eval_split(val_split, "val")

    metrics = {
        "recommender": recommender_name,
        "label_match_pct": label_report.get("matched_pct"),
        "train": {"recall@10": train_r10, "recall@5": train_r5, "mrr@10": train_mrr10, "n": len(train_split)},
        "val": {"recall@10": val_r10, "recall@5": val_r5, "mrr@10": val_mrr10, "n": len(val_split)},
    }
    with open(Path(out_dir) / "metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)
    pd.DataFrame(train_rows + val_rows).to_json(Path(out_dir) / "per_query_results.jsonl", orient="records", lines=True)
    # The lowest-hit validation queries are the most informative failures.
    worst = sorted(val_rows, key=lambda r: r["hits"])[:10]
    pd.DataFrame(worst).to_csv(Path(out_dir) / "worst_queries.csv", index=False)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def main():
    """CLI entry point for the evaluation harness.

    Simple recommenders (dummy_random, bm25) are delegated to run_eval().
    Vector/hybrid recommenders are constructed inline here because they need
    a FAISS index, an aligned id list, and an embedding model, and optionally
    a query rewriter, cross-encoder reranker, or LGBM reranker.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", default="data/catalog.jsonl")
    parser.add_argument("--train", required=True)
    parser.add_argument("--recommender", default="dummy_random")
    parser.add_argument("--out-dir", default=None)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--vector-index", type=str, help="Path to FAISS index (for recommender=vector/hybrid_rrf)")
    parser.add_argument("--assessment-ids", type=str, help="Path to assessment_ids.json aligned with embeddings/index")
    parser.add_argument("--model", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Embedding model for vector recommender")
    parser.add_argument("--topn-candidates", type=int, default=200, help="Top-N candidates to retrieve before fusion/rerank")
    parser.add_argument("--rrf-k", type=int, default=60, help="RRF smoothing constant")
    parser.add_argument("--reranker-model", type=str, default="cross-encoder/ms-marco-MiniLM-L-6-v2", help="Cross-encoder model for reranking")
    parser.add_argument("--lgbm-model", type=str, help="Path to trained LGBM model (for hybrid_rrf_lgbm)")
    parser.add_argument("--lgbm-features", type=str, help="Path to feature_schema.json for LGBM reranker")
    parser.add_argument("--use-rewriter", action="store_true", help="Rewrite queries before retrieval/rerank.")
    parser.add_argument("--vocab", type=str, help="Optional vocab JSON for rewriter boosts.")
    args = parser.parse_args()

    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+; consider
    # datetime.now(timezone.utc) when the project's minimum version allows.
    run_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_dir = args.out_dir or f"runs/{run_id}_{args.recommender}"
    if args.recommender in {"vector", "hybrid_rrf", "hybrid_rrf_rerank", "hybrid_rrf_lgbm"}:
        if not args.vector_index or not args.assessment_ids:
            raise ValueError("Vector/hybrid recommender requires --vector-index and --assessment-ids")
        df_catalog, _, id_by_url = load_catalog(args.catalog)
        with open(args.assessment_ids) as f:
            ids = json.load(f)
        index = VectorIndex.load(args.vector_index)
        embed_model = EmbeddingModel(args.model)
        examples, label_report = load_train(args.train, id_by_url)
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        save_label_resolution_report(label_report, Path(out_dir) / "label_resolution_report.json")
        vocab = {}
        if args.use_rewriter and args.vocab:
            with open(args.vocab) as f:
                vocab = json.load(f)

        train_split, val_split = split_examples(examples, val_ratio=0.2, seed=args.seed)
        vector_rec = VectorRecommender(embed_model, index, df_catalog, ids, k_candidates=args.topn_candidates)
        if args.recommender == "vector":
            recommender = vector_rec
        elif args.recommender == "hybrid_rrf":
            # BM25 + vector fused by reciprocal-rank fusion.
            bm25_rec = BM25Recommender(df_catalog)
            recommender = HybridRRFRecommender(bm25_rec, vector_rec, topn_candidates=args.topn_candidates, rrf_k=args.rrf_k)
        elif args.recommender == "hybrid_rrf_rerank":
            # RRF candidates re-scored by a cross-encoder.
            bm25_rec = BM25Recommender(df_catalog)
            reranker = CrossEncoderReranker(model_name=args.reranker_model)
            recommender = HybridRerankRecommender(
                bm25_rec,
                vector_rec,
                reranker,
                df_catalog,
                topn_candidates=args.topn_candidates,
                rrf_k=args.rrf_k,
            )
        else:
            # Remaining option: hybrid_rrf_lgbm (LightGBM learned reranker).
            if not args.lgbm_model or not args.lgbm_features:
                raise ValueError("hybrid_rrf_lgbm requires --lgbm-model and --lgbm-features")
            bm25_rec = BM25Recommender(df_catalog)
            # BUGFIX: previously json.load(open(...)) leaked the file handle;
            # use a context manager like the other file reads in this function.
            with open(args.lgbm_features) as f:
                feature_cols = json.load(f)
            if isinstance(feature_cols, dict) and "features" in feature_cols:
                feature_cols = feature_cols["features"]
            recommender = HybridRRFLGBMRecommender(
                bm25_rec,
                vector_rec,
                lgbm_model_path=args.lgbm_model,
                feature_cols=feature_cols,
                catalog_df=df_catalog,
                topn_candidates=args.topn_candidates,
                rrf_k=args.rrf_k,
            )

        def eval_split(split, split_name):
            # Score one split; mirrors run_eval() but is rewriter-aware.
            preds_list = []
            gt_list = []
            rows = []
            for ex in split:
                retrieval_query = ex.query
                rerank_query = ex.query
                if args.use_rewriter:
                    # The rewriter may produce different texts for the
                    # retrieval and rerank stages.
                    rw = rewrite_query(ex.query, catalog_vocab=vocab)
                    retrieval_query = rw.retrieval_query
                    rerank_query = rw.rerank_query
                if args.recommender == "hybrid_rrf_rerank":
                    preds_raw = recommender.recommend(retrieval_query, k=10, rerank_query=rerank_query)
                else:
                    preds_raw = recommender.recommend(retrieval_query, k=10)
                # Normalize plain ids / dicts with "assessment_id" to a flat list.
                preds = []
                for pr in preds_raw:
                    if isinstance(pr, str):
                        preds.append(pr)
                    elif isinstance(pr, dict) and "assessment_id" in pr:
                        preds.append(pr["assessment_id"])
                preds = preds[:10]
                preds_list.append(preds)
                gt_list.append(ex.relevant_ids)
                hits = len(set(preds).intersection(ex.relevant_ids))
                rows.append(
                    {
                        "query": ex.query,
                        "relevant_ids": list(ex.relevant_ids),
                        "predicted_ids": preds,
                        "hits": hits,
                    }
                )
            recall10 = sum(recall_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            recall5 = sum(recall_at_k(g, p, 5) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            mrr10 = sum(mrr_at_k(g, p, 10) for g, p in zip(gt_list, preds_list)) / len(gt_list) if gt_list else 0.0
            return recall10, recall5, mrr10, rows

        train_r10, train_r5, train_mrr10, train_rows = eval_split(train_split, "train")
        val_r10, val_r5, val_mrr10, val_rows = eval_split(val_split, "val")
        metrics = {
            "recommender": args.recommender,
            "label_match_pct": label_report.get("matched_pct"),
            "train": {"recall@10": train_r10, "recall@5": train_r5, "mrr@10": train_mrr10, "n": len(train_split)},
            "val": {"recall@10": val_r10, "recall@5": val_r5, "mrr@10": val_mrr10, "n": len(val_split)},
            "config": {
                "topn_candidates": args.topn_candidates,
                "rrf_k": args.rrf_k,
                "model": args.model,
                "index": args.vector_index,
            },
        }
        with open(Path(out_dir) / "metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)
        pd.DataFrame(train_rows + val_rows).to_json(Path(out_dir) / "per_query_results.jsonl", orient="records", lines=True)
        # The lowest-hit validation queries are the most informative failures.
        worst = sorted(val_rows, key=lambda r: r["hits"])[:10]
        pd.DataFrame(worst).to_csv(Path(out_dir) / "worst_queries.csv", index=False)
        print(f"Run saved to {out_dir}")
    else:
        run_eval(args.catalog, args.train, args.recommender, out_dir, seed=args.seed)
        print(f"Run saved to {out_dir}")
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
# Script entry point, e.g.: python -m eval.run_eval --train data/train.jsonl
if __name__ == "__main__":
    main()
|
frontend/.dockerignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
.env.*
|
| 3 |
+
.next
|
| 4 |
+
node_modules
|
| 5 |
+
npm-debug.log*
|
| 6 |
+
yarn-debug.log*
|
| 7 |
+
yarn-error.log*
|
| 8 |
+
.turbo
|
| 9 |
+
.vercel
|
frontend/Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM node:20-alpine AS deps
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
COPY package*.json ./
|
| 4 |
+
RUN npm ci
|
| 5 |
+
|
| 6 |
+
FROM deps AS builder
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
COPY . .
|
| 9 |
+
RUN npm run build
|
| 10 |
+
|
| 11 |
+
FROM node:20-alpine AS runner
|
| 12 |
+
WORKDIR /app
|
| 13 |
+
ENV NODE_ENV=production
|
| 14 |
+
COPY --from=builder /app/.next ./.next
|
| 15 |
+
COPY --from=builder /app/public ./public
|
| 16 |
+
COPY --from=builder /app/package*.json ./
|
| 17 |
+
RUN npm ci --omit=dev
|
| 18 |
+
EXPOSE 3000
|
| 19 |
+
CMD ["npm", "start"]
|
frontend/index.html
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>Assessment Recommender</title>
|
| 7 |
+
<style>
|
| 8 |
+
body { font-family: Arial, sans-serif; margin: 24px; background: #f7f7f7; }
|
| 9 |
+
.container { max-width: 960px; margin: 0 auto; background: #fff; padding: 20px; border-radius: 8px; box-shadow: 0 2px 6px rgba(0,0,0,0.1); }
|
| 10 |
+
textarea, input { width: 100%; }
|
| 11 |
+
label { display: block; margin: 8px 0 4px; font-weight: 600; }
|
| 12 |
+
button { margin-top: 12px; padding: 10px 16px; cursor: pointer; }
|
| 13 |
+
pre { background: #111; color: #0f0; padding: 12px; border-radius: 4px; overflow: auto; max-height: 420px; }
|
| 14 |
+
.row { display: flex; gap: 8px; align-items: center; }
|
| 15 |
+
.row input[type="checkbox"] { width: auto; }
|
| 16 |
+
</style>
|
| 17 |
+
</head>
|
| 18 |
+
<body>
|
| 19 |
+
<div class="container">
|
| 20 |
+
<h2>Assessment Recommender</h2>
|
| 21 |
+
<label for="api">API base URL</label>
|
| 22 |
+
<input id="api" type="text" placeholder="http://localhost:8000" />
|
| 23 |
+
|
| 24 |
+
<label for="query">Query</label>
|
| 25 |
+
<textarea id="query" rows="4" placeholder="Enter your query..."></textarea>
|
| 26 |
+
|
| 27 |
+
<label for="clarification">Clarification (optional)</label>
|
| 28 |
+
<input id="clarification" type="text" placeholder="If a clarification question was asked, answer here" />
|
| 29 |
+
|
| 30 |
+
<div class="row">
|
| 31 |
+
<input id="verbose" type="checkbox" />
|
| 32 |
+
<label for="verbose" style="margin: 0; font-weight: 400;">Verbose (debug)</label>
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
<button id="submit">Submit</button>
|
| 36 |
+
|
| 37 |
+
<h3>Response</h3>
|
| 38 |
+
<pre id="output">Awaiting input...</pre>
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
<script type="module" src="/static/main.js"></script>
|
| 42 |
+
</body>
|
| 43 |
+
</html>
|
frontend/next-env.d.ts
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="next" />
|
| 2 |
+
/// <reference types="next/image-types/global" />
|
| 3 |
+
|
| 4 |
+
// NOTE: This file should not be edited
|
| 5 |
+
// see https://nextjs.org/docs/basic-features/typescript for more information.
|
frontend/next.config.mjs
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** @type {import('next').NextConfig} */
|
| 2 |
+
const nextConfig = {
|
| 3 |
+
reactStrictMode: false,
|
| 4 |
+
// Static export for hosting on static platforms (Render static site, etc.)
|
| 5 |
+
output: "export"
|
| 6 |
+
};
|
| 7 |
+
|
| 8 |
+
export default nextConfig;
|
frontend/out/404.html
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/_next/static/css/e2d6b4bec72e8797.css" data-precedence="next"/><link rel="preload" as="script" fetchPriority="low" href="/_next/static/chunks/webpack-879f858537244e02.js"/><script src="/_next/static/chunks/fd9d1056-0eb575322ff5015c.js" async=""></script><script src="/_next/static/chunks/23-02b97631d99e6f05.js" async=""></script><script src="/_next/static/chunks/main-app-df951a18dbec0e17.js" async=""></script><title>404: This page could not be found.</title><title>SHL Assessment Recommender</title><meta name="description" content="Chat + recommendations UI powered by FastAPI backend"/><script src="/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js" noModule=""></script></head><body class="bg-slate-100"><div style="font-family:system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding:0 23px 0 0;font-size:24px;font-weight:500;vertical-align:top;line-height:49px">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:49px;margin:0">This page could not be found.</h2></div></div></div><script src="/_next/static/chunks/webpack-879f858537244e02.js" 
async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/_next/static/css/e2d6b4bec72e8797.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"2:I[5751,[],\"\"]\n4:I[9275,[],\"\"]\n5:I[1343,[],\"\"]\nb:I[6130,[],\"\"]\n6:{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"}\n7:{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"}\n8:{\"display\":\"inline-block\"}\n9:{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0}\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/_next/static/css/e2d6b4bec72e8797.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"$L2\",null,{\"buildId\":\"tvK2sxvsuv7CccL1KsVpv\",\"assetPrefix\":\"\",\"initialCanonicalUrl\":\"/_not-found\",\"initialTree\":[\"\",{\"children\":[\"/_not-found\",{\"children\":[\"__PAGE__\",{}]}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"/_not-found\",{\"children\":[\"__PAGE__\",{},[[\"$L3\",[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]]],null],null]},[\"$\",\"$L4\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\",\"/_not-found\",\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L5\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":\"$undefined\",\"notFoundStyles\":\"$undefined\",\"styles\":null}],null]},[[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"bg-slate-100\",\"children\":[\"$\",\"$L4\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L5\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be 
found.\"}],[\"$\",\"div\",null,{\"style\":\"$6\",\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":\"$7\",\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":\"$8\",\"children\":[\"$\",\"h2\",null,{\"style\":\"$9\",\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null],null],\"couldBeIntercepted\":false,\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"SHL Assessment Recommender\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"Chat + recommendations UI powered by FastAPI backend\"}]]\n3:null\n"])</script></body></html>
|
frontend/out/_next/static/chunks/23-02b97631d99e6f05.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/out/_next/static/chunks/app/_not-found/page-a99a188ec9244b3f.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[409],{7589:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found/page",function(){return n(5457)}])},5457:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return s}}),n(9920);let i=n(7437);n(2265);let o={fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},l={display:"inline-block"},r={display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},d={fontSize:14,fontWeight:400,lineHeight:"49px",margin:0};function s(){return(0,i.jsxs)(i.Fragment,{children:[(0,i.jsx)("title",{children:"404: This page could not be found."}),(0,i.jsx)("div",{style:o,children:(0,i.jsxs)("div",{children:[(0,i.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,i.jsx)("h1",{className:"next-error-h1",style:r,children:"404"}),(0,i.jsx)("div",{style:l,children:(0,i.jsx)("h2",{style:d,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,23,744],function(){return e(e.s=7589)}),_N_E=e.O()}]);
|
frontend/out/_next/static/chunks/app/layout-fc95adeb217fd9c8.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{3711:function(n,e,u){Promise.resolve().then(u.t.bind(u,3054,23))},3054:function(){}},function(n){n.O(0,[141,971,23,744],function(){return n(n.s=3711)}),_N_E=n.O()}]);
|
frontend/out/_next/static/chunks/app/page-73ea6ec0ec8fa438.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[931],{5531:function(e,t,s){Promise.resolve().then(s.bind(s,9306))},9306:function(e,t,s){"use strict";s.r(t),s.d(t,{default:function(){return M}});var a=s(7437),l=s(2265);/**
|
| 2 |
+
* @license lucide-react v0.561.0 - ISC
|
| 3 |
+
*
|
| 4 |
+
* This source code is licensed under the ISC license.
|
| 5 |
+
* See the LICENSE file in the root directory of this source tree.
|
| 6 |
+
*/let r=e=>e.replace(/([a-z0-9])([A-Z])/g,"$1-$2").toLowerCase(),n=e=>e.replace(/^([A-Z])|[\s-_]+(\w)/g,(e,t,s)=>s?s.toUpperCase():t.toLowerCase()),i=e=>{let t=n(e);return t.charAt(0).toUpperCase()+t.slice(1)},o=function(){for(var e=arguments.length,t=Array(e),s=0;s<e;s++)t[s]=arguments[s];return t.filter((e,t,s)=>!!e&&""!==e.trim()&&s.indexOf(e)===t).join(" ").trim()},d=e=>{for(let t in e)if(t.startsWith("aria-")||"role"===t||"title"===t)return!0};/**
|
| 7 |
+
* @license lucide-react v0.561.0 - ISC
|
| 8 |
+
*
|
| 9 |
+
* This source code is licensed under the ISC license.
|
| 10 |
+
* See the LICENSE file in the root directory of this source tree.
|
| 11 |
+
*/var c={xmlns:"http://www.w3.org/2000/svg",width:24,height:24,viewBox:"0 0 24 24",fill:"none",stroke:"currentColor",strokeWidth:2,strokeLinecap:"round",strokeLinejoin:"round"};/**
|
| 12 |
+
* @license lucide-react v0.561.0 - ISC
|
| 13 |
+
*
|
| 14 |
+
* This source code is licensed under the ISC license.
|
| 15 |
+
* See the LICENSE file in the root directory of this source tree.
|
| 16 |
+
*/let u=(0,l.forwardRef)((e,t)=>{let{color:s="currentColor",size:a=24,strokeWidth:r=2,absoluteStrokeWidth:n,className:i="",children:u,iconNode:m,...x}=e;return(0,l.createElement)("svg",{ref:t,...c,width:a,height:a,stroke:s,strokeWidth:n?24*Number(r)/Number(a):r,className:o("lucide",i),...!u&&!d(x)&&{"aria-hidden":"true"},...x},[...m.map(e=>{let[t,s]=e;return(0,l.createElement)(t,s)}),...Array.isArray(u)?u:[u]])}),m=(e,t)=>{let s=(0,l.forwardRef)((s,a)=>{let{className:n,...d}=s;return(0,l.createElement)(u,{ref:a,iconNode:t,className:o("lucide-".concat(r(i(e))),"lucide-".concat(e),n),...d})});return s.displayName=i(e),s},x=m("refresh-cw",[["path",{d:"M3 12a9 9 0 0 1 9-9 9.75 9.75 0 0 1 6.74 2.74L21 8",key:"v9h5vc"}],["path",{d:"M21 3v5h-5",key:"1q7to0"}],["path",{d:"M21 12a9 9 0 0 1-9 9 9.75 9.75 0 0 1-6.74-2.74L3 16",key:"3uifl3"}],["path",{d:"M8 16H3v5",key:"1cv678"}]]),h=m("send",[["path",{d:"M14.536 21.686a.5.5 0 0 0 .937-.024l6.5-19a.496.496 0 0 0-.635-.635l-19 6.5a.5.5 0 0 0-.024.937l7.93 3.18a2 2 0 0 1 1.112 1.11z",key:"1ffxy3"}],["path",{d:"m21.854 2.147-10.94 10.939",key:"12cjpa"}]]),p=m("bug",[["path",{d:"M12 20v-9",key:"1qisl0"}],["path",{d:"M14 7a4 4 0 0 1 4 4v3a6 6 0 0 1-12 0v-3a4 4 0 0 1 4-4z",key:"uouzyp"}],["path",{d:"M14.12 3.88 16 2",key:"qol33r"}],["path",{d:"M21 21a4 4 0 0 0-3.81-4",key:"1b0z45"}],["path",{d:"M21 5a4 4 0 0 1-3.55 3.97",key:"5cxbf6"}],["path",{d:"M22 13h-4",key:"1jl80f"}],["path",{d:"M3 21a4 4 0 0 1 3.81-4",key:"1fjd4g"}],["path",{d:"M3 5a4 4 0 0 0 3.55 3.97",key:"1d7oge"}],["path",{d:"M6 13H2",key:"82j7cp"}],["path",{d:"m8 2 1.88 1.88",key:"fmnt4t"}],["path",{d:"M9 7.13V6a3 3 0 1 1 6 0v1.13",key:"1vgav8"}]]),v=m("settings",[["path",{d:"M9.671 4.136a2.34 2.34 0 0 1 4.659 0 2.34 2.34 0 0 0 3.319 1.915 2.34 2.34 0 0 1 2.33 4.033 2.34 2.34 0 0 0 0 3.831 2.34 2.34 0 0 1-2.33 4.033 2.34 2.34 0 0 0-3.319 1.915 2.34 2.34 0 0 1-4.659 0 2.34 2.34 0 0 0-3.32-1.915 2.34 2.34 0 0 1-2.33-4.033 2.34 2.34 0 0 0 0-3.831A2.34 2.34 0 0 1 6.35 
6.051a2.34 2.34 0 0 0 3.319-1.915",key:"1i5ecw"}],["circle",{cx:"12",cy:"12",r:"3",key:"1v7zrd"}]]),f=m("funnel",[["path",{d:"M10 20a1 1 0 0 0 .553.895l2 1A1 1 0 0 0 14 21v-7a2 2 0 0 1 .517-1.341L21.74 4.67A1 1 0 0 0 21 3H3a1 1 0 0 0-.742 1.67l7.225 7.989A2 2 0 0 1 10 14z",key:"sc7q7i"}]]),g=m("search",[["path",{d:"m21 21-4.34-4.34",key:"14j7rj"}],["circle",{cx:"11",cy:"11",r:"8",key:"4ej97u"}]]),b=m("sliders-horizontal",[["path",{d:"M10 5H3",key:"1qgfaw"}],["path",{d:"M12 19H3",key:"yhmn1j"}],["path",{d:"M14 3v4",key:"1sua03"}],["path",{d:"M16 17v4",key:"1q0r14"}],["path",{d:"M21 12h-9",key:"1o4lsq"}],["path",{d:"M21 19h-5",key:"1rlt1p"}],["path",{d:"M21 5h-7",key:"1oszz2"}],["path",{d:"M8 10v4",key:"tgpxqk"}],["path",{d:"M8 12H3",key:"a7s4jb"}]]),j=m("link",[["path",{d:"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71",key:"1cjeqo"}],["path",{d:"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71",key:"19qd67"}]]);async function y(e,t,s){return w("".concat(e.replace(/\/$/,""),"/chat"),t,s)}async function N(e,t,s){return w("".concat(e.replace(/\/$/,""),"/recommend"),t,s)}async function w(e,t,s){let a=new AbortController,l=setTimeout(()=>a.abort(),3e4);try{let l=await fetch(e,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(t),signal:s||a.signal});if(!l.ok){let e=await l.text();throw Error("HTTP ".concat(l.status,": ").concat(e))}return l.json()}finally{clearTimeout(l)}}function k(e,t){let[s,a]=(0,l.useState)(t);return(0,l.useEffect)(()=>{try{let t=localStorage.getItem(e);null!==t&&a(JSON.parse(t))}catch(e){}},[]),(0,l.useEffect)(()=>{try{localStorage.setItem(e,JSON.stringify(s))}catch(e){}},[e,s]),[s,a]}let C=["Java dev + collaboration + 40 minutes","Sales graduate assessment for 60 minutes","Culture fit assessment for COO, 60 minutes"];function M(){var 
e,t,s;let[r,n]=k("api_base","http://localhost:8000"),[i,o]=k("mode","recommend"),[d,c]=k("verbose",!1),[u,m]=k("llm_model","Qwen/Qwen2.5-1.5B-Instruct"),[w,M]=(0,l.useState)(""),[S,A]=(0,l.useState)(""),[_,z]=(0,l.useState)([]),[L,q]=(0,l.useState)(!1),[E,O]=(0,l.useState)(null),[T,H]=(0,l.useState)({search:"",remote:"any",adaptive:"any",duration:"any",sort:"match"}),R=(0,l.useRef)(null);(0,l.useEffect)(()=>{_.length&&null===E&&O(_.length-1)},[_,E]);let D=null!==E?_[E]:null,I=(null==D?void 0:null===(e=D.response)||void 0===e?void 0:e.recommended_assessments)||(null==D?void 0:null===(t=D.response)||void 0===t?void 0:t.final_results)||[],J=null==D?void 0:null===(s=D.response)||void 0===s?void 0:s.debug,U=(0,l.useMemo)(()=>{let e=[...I],{search:t,remote:s,adaptive:a,duration:l,sort:r}=T;if(t.trim()){let s=t.toLowerCase();e=e.filter(e=>{var t,a,l;return(null===(t=e.name)||void 0===t?void 0:t.toLowerCase().includes(s))||(null===(a=e.description)||void 0===a?void 0:a.toLowerCase().includes(s))||(null===(l=e.test_type)||void 0===l?void 0:l.some(e=>e.toLowerCase().includes(s)))})}return"any"!==s&&(e=e.filter(e=>(e.remote_support||"").toLowerCase()===s.toLowerCase())),"any"!==a&&(e=e.filter(e=>(e.adaptive_support||"").toLowerCase()===a.toLowerCase())),"any"!==l&&(e=e.filter(e=>{let t=e.duration;return null==t?"unknown"===l:"<=20"===l?t<=20:"<=40"===l?t<=40:"<=60"!==l||t<=60})),"short"===r?e.sort((e,t)=>(e.duration||999)-(t.duration||999)):"adaptive"===r&&e.sort((e,t)=>("Yes"===t.adaptive_support?1:0)-("Yes"===e.adaptive_support?1:0)),e},[I,T]),P=async()=>{var e;if(!w.trim())return;q(!0),null===(e=R.current)||void 0===e||e.abort();let t=new AbortController;R.current=t;let s={query:w,verbose:d};S.trim()&&(s.clarification_answer=S.trim()),"recommend"===i&&u&&(s.llm_model=u);let a=crypto.randomUUID(),l=Date.now();z(e=>[...e,{id:a,query:w,response:null,ts:l}]);try{let e="chat"===i?await y(r,s,t.signal):await N(r,s,t.signal);z(t=>t.map(t=>t.id===a?{...t,response:e,error:void 
0}:t)),O(_.length),M(""),A("")}catch(e){z(t=>t.map(t=>t.id===a?{...t,error:e.message}:t))}finally{q(!1)}},Y=(0,a.jsxs)("div",{className:"flex items-center justify-between mb-3",children:[(0,a.jsxs)("div",{children:[(0,a.jsx)("h1",{className:"text-3xl font-semibold text-slate-900",children:"SHL Assessment Recommender"}),(0,a.jsx)("p",{className:"text-sm text-slate-600",children:"Chat to get top-10 assessments. Filters and debug on the right."})]}),(0,a.jsxs)("div",{className:"hidden md:flex items-center gap-2 text-xs text-slate-500",children:[(0,a.jsx)(x,{size:16})," Live against FastAPI backend"]})]}),B=(0,a.jsxs)("div",{className:"flex flex-wrap gap-3 text-sm",children:[(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsx)("label",{className:"font-medium",children:"Mode"}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:i,onChange:e=>o(e.target.value),children:[(0,a.jsx)("option",{value:"recommend",children:"/recommend"}),(0,a.jsx)("option",{value:"chat",children:"/chat"})]})]}),(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsx)("label",{className:"font-medium",children:"LLM"}),(0,a.jsx)("input",{className:"border rounded px-2 py-1",value:u,onChange:e=>m(e.target.value),placeholder:"Qwen/Qwen2.5-1.5B-Instruct"})]}),(0,a.jsxs)("label",{className:"flex items-center gap-2",children:[(0,a.jsx)("input",{type:"checkbox",checked:d,onChange:e=>c(e.target.checked)}),"Verbose debug"]})]}),Q=(0,a.jsxs)("div",{className:"flex flex-col h-full",children:[(0,a.jsxs)("div",{className:"flex flex-col gap-3 flex-1 overflow-hidden bg-white border rounded-xl shadow-sm p-4",children:[(0,a.jsxs)("div",{className:"flex items-center justify-between",children:[(0,a.jsxs)("div",{className:"text-lg font-semibold flex items-center gap-2",children:[(0,a.jsx)(h,{size:18})," Chat"]}),(0,a.jsx)("button",{onClick:()=>{M(C[0])},className:"text-xs text-blue-600 hover:underline",children:"Use sample"})]}),(0,a.jsxs)("div",{className:"flex 
gap-2 items-center text-sm",children:[(0,a.jsx)("label",{className:"font-medium min-w-[70px]",children:"API base"}),(0,a.jsx)("input",{className:"border rounded px-2 py-1 w-full",value:r,onChange:e=>n(e.target.value)})]}),(0,a.jsx)("textarea",{className:"border rounded-lg p-3 w-full text-sm min-h-[140px] resize-none focus:ring-2 focus:ring-blue-200",placeholder:"Enter job description or query",value:w,onChange:e=>M(e.target.value),onKeyDown:e=>{"Enter"!==e.key||e.shiftKey||(e.preventDefault(),P())}}),(0,a.jsx)("div",{className:"flex gap-2",children:C.map(e=>(0,a.jsx)("button",{onClick:()=>M(e),className:"text-xs bg-slate-100 hover:bg-slate-200 px-2 py-1 rounded",children:e},e))}),(0,a.jsxs)("div",{className:"flex gap-3 items-center",children:[(0,a.jsx)("input",{className:"border rounded px-2 py-1 text-sm flex-1",placeholder:"Clarification (if asked)",value:S,onChange:e=>A(e.target.value)}),(0,a.jsxs)("button",{onClick:P,disabled:L,className:"bg-blue-600 text-white px-4 py-2 rounded-lg flex items-center gap-2 hover:bg-blue-700 disabled:opacity-60",children:[(0,a.jsx)(h,{size:16})," ",L?"Sending...":"Send"]}),(0,a.jsx)("button",{onClick:()=>c(!d),className:"p-2 border rounded-lg hover:bg-slate-100",title:"Toggle verbose debug",children:(0,a.jsx)(p,{size:16})}),(0,a.jsx)("button",{onClick:()=>o("recommend"===i?"chat":"recommend"),className:"p-2 border rounded-lg hover:bg-slate-100",title:"Toggle endpoint",children:(0,a.jsx)(v,{size:16})})]}),B]}),(0,a.jsxs)("div",{className:"mt-3 bg-white border rounded-xl shadow-sm p-3 text-sm text-slate-600 max-h-48 overflow-auto",children:[(0,a.jsx)("div",{className:"font-semibold mb-2",children:"History"}),0===_.length&&(0,a.jsx)("div",{className:"text-slate-400",children:"No queries yet."}),_.map((e,t)=>(0,a.jsxs)("button",{onClick:()=>O(t),className:"block w-full text-left px-2 py-1 rounded ".concat(t===E?"bg-blue-50 text-blue-700":"hover:bg-slate-100"),children:[(0,a.jsx)("div",{className:"font-medium text-sm 
truncate",children:e.query}),(0,a.jsx)("div",{className:"text-xs text-slate-500",children:new Date(e.ts).toLocaleTimeString()}),e.error&&(0,a.jsxs)("div",{className:"text-xs text-red-600",children:["Error: ",e.error]})]},e.id))]})]}),$=(0,a.jsxs)("div",{className:"flex flex-col h-full",children:[(0,a.jsxs)("div",{className:"bg-white border rounded-xl shadow-sm p-4 flex flex-col gap-3",children:[(0,a.jsxs)("div",{className:"flex items-center justify-between",children:[(0,a.jsxs)("div",{className:"text-lg font-semibold flex items-center gap-2",children:[(0,a.jsx)(f,{size:18})," Results"]}),(0,a.jsxs)("div",{className:"flex items-center gap-2",children:[(0,a.jsxs)("div",{className:"relative",children:[(0,a.jsx)(g,{className:"absolute left-2 top-2.5 h-4 w-4 text-slate-400"}),(0,a.jsx)("input",{className:"pl-8 pr-3 py-2 border rounded-lg text-sm",placeholder:"Search results",value:T.search,onChange:e=>H(t=>({...t,search:e.target.value}))})]}),(0,a.jsx)(b,{size:16,className:"text-slate-500"})]})]}),(0,a.jsxs)("div",{className:"flex flex-wrap gap-3 text-xs",children:[(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.remote,onChange:e=>H(t=>({...t,remote:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Remote: Any"}),(0,a.jsx)("option",{value:"Yes",children:"Remote: Yes"}),(0,a.jsx)("option",{value:"No",children:"Remote: No"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.adaptive,onChange:e=>H(t=>({...t,adaptive:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Adaptive: Any"}),(0,a.jsx)("option",{value:"Yes",children:"Adaptive: Yes"}),(0,a.jsx)("option",{value:"No",children:"Adaptive: No"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.duration,onChange:e=>H(t=>({...t,duration:e.target.value})),children:[(0,a.jsx)("option",{value:"any",children:"Duration: Any"}),(0,a.jsx)("option",{value:"<=20",children:"≤ 20 min"}),(0,a.jsx)("option",{value:"<=40",children:"≤ 40 
min"}),(0,a.jsx)("option",{value:"<=60",children:"≤ 60 min"}),(0,a.jsx)("option",{value:"unknown",children:"Unknown only"})]}),(0,a.jsxs)("select",{className:"border rounded px-2 py-1",value:T.sort,onChange:e=>H(t=>({...t,sort:e.target.value})),children:[(0,a.jsx)("option",{value:"match",children:"Sort: Best match"}),(0,a.jsx)("option",{value:"short",children:"Sort: Shortest"}),(0,a.jsx)("option",{value:"adaptive",children:"Sort: Adaptive first"})]})]}),(0,a.jsxs)("div",{className:"grid md:grid-cols-2 lg:grid-cols-2 gap-3",children:[0===U.length&&(0,a.jsx)("div",{className:"text-sm text-slate-500",children:"No results yet. Submit a query to see recommendations."}),U.map((e,t)=>{var s;return(0,a.jsxs)("div",{className:"border rounded-xl p-4 shadow-sm hover:shadow-md transition bg-slate-50",children:[(0,a.jsxs)("div",{className:"flex items-start justify-between gap-2",children:[(0,a.jsx)("a",{href:e.url,target:"_blank",rel:"noreferrer",className:"font-semibold text-slate-900 hover:text-blue-600",children:e.name||"Untitled"}),(0,a.jsx)("button",{className:"text-slate-500 hover:text-blue-600",onClick:()=>e.url&&navigator.clipboard.writeText(e.url),children:(0,a.jsx)(j,{size:16})})]}),(0,a.jsxs)("div",{className:"flex flex-wrap gap-2 mt-2",children:[null===(s=e.test_type)||void 0===s?void 0:s.map(e=>(0,a.jsx)("span",{className:"text-[11px] bg-blue-50 text-blue-700 px-2 py-1 rounded-full border border-blue-100",children:e},e)),(0,a.jsx)("span",{className:"text-[11px] bg-slate-100 text-slate-700 px-2 py-1 rounded-full border border-slate-200",children:e.duration?"".concat(e.duration," min"):"Duration unknown"}),(0,a.jsxs)("span",{className:"text-[11px] bg-emerald-50 text-emerald-700 px-2 py-1 rounded-full border border-emerald-100",children:["Remote: ",e.remote_support||"?"]}),(0,a.jsxs)("span",{className:"text-[11px] bg-indigo-50 text-indigo-700 px-2 py-1 rounded-full border border-indigo-100",children:["Adaptive: 
",e.adaptive_support||"?"]})]}),(0,a.jsx)("p",{className:"text-sm text-slate-700 mt-2 overflow-hidden text-ellipsis",children:e.description||"No description."})]},t)})]})]}),d&&J&&(0,a.jsxs)("div",{className:"mt-3 bg-white border rounded-xl shadow-sm p-4",children:[(0,a.jsxs)("div",{className:"flex items-center gap-2 text-sm font-semibold mb-2",children:[(0,a.jsx)(p,{size:16})," Debug"]}),(0,a.jsxs)("div",{className:"grid md:grid-cols-2 gap-3 text-xs",children:[(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Plan"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-48 text-slate-700",children:JSON.stringify(J.plan,null,2)})]}),J.fusion&&(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Fusion"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-48 text-slate-700",children:JSON.stringify(J.fusion,null,2)})]}),J.candidates&&(0,a.jsxs)("div",{className:"bg-slate-50 border rounded p-2 col-span-2",children:[(0,a.jsx)("div",{className:"font-semibold mb-1",children:"Top candidates"}),(0,a.jsx)("pre",{className:"overflow-auto max-h-60 text-slate-700",children:JSON.stringify(J.candidates,null,2)})]})]})]})]});return(0,a.jsx)("main",{className:"min-h-screen bg-slate-100",children:(0,a.jsxs)("div",{className:"app-shell py-6",children:[Y,(0,a.jsxs)("div",{className:"grid lg:grid-cols-2 gap-6 mt-4",children:[Q,$]})]})})}}},function(e){e.O(0,[971,23,744],function(){return e(e.s=5531)}),_N_E=e.O()}]);
|
frontend/out/_next/static/chunks/fd9d1056-0eb575322ff5015c.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/out/_next/static/chunks/framework-aec844d2ccbe7592.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/out/_next/static/chunks/main-app-df951a18dbec0e17.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{3332:function(e,n,t){Promise.resolve().then(t.t.bind(t,5751,23)),Promise.resolve().then(t.t.bind(t,6513,23)),Promise.resolve().then(t.t.bind(t,6130,23)),Promise.resolve().then(t.t.bind(t,9275,23)),Promise.resolve().then(t.t.bind(t,5324,23)),Promise.resolve().then(t.t.bind(t,1343,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,23],function(){return n(1028),n(3332)}),_N_E=e.O()}]);
|