# Python interpreter used by all targets (override with `make PYTHON=python3`).
PYTHON ?= /usr/local/opt/python@3.11/bin/python3.11

# All targets are commands, not files — declare them phony so a stray file
# with the same name never makes a target silently "up to date".
.PHONY: install test lint serve ingest ingest-k8s evaluate-fast evaluate-full \
        benchmark evaluate-langchain calibrate evaluate-judges docker \
        modal-deploy modal-stop vllm-up benchmark-all k8s-dev k8s-prod \
        tf-plan tf-validate

install: ## Install the package in editable mode with dev extras
	$(PYTHON) -m pip install -e ".[dev]"

test: ## Run the test suite (verbose, short tracebacks)
	$(PYTHON) -m pytest tests/ -v --tb=short

lint: ## Lint, format-check, and type-check the codebase
	ruff check agent_bench/ tests/
	ruff format --check agent_bench/ tests/
	mypy agent_bench/ --ignore-missing-imports

serve: ## Run the API locally with auto-reload on port 8000
	$(PYTHON) -m uvicorn agent_bench.serving.app:create_app --factory --reload --port 8000

ingest: ## Ingest documents using the tech_docs task config
	$(PYTHON) scripts/ingest.py --config configs/tasks/tech_docs.yaml

ingest-k8s: ## Ingest Kubernetes docs into .cache/store_k8s
	$(PYTHON) scripts/ingest.py --doc-dir data/k8s_docs --store-path .cache/store_k8s

evaluate-fast: ## Deterministic evaluation (no LLM judges)
	$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode deterministic

evaluate-full: ## Full evaluation including LLM judges
	$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode full

benchmark: ## Generate the benchmark report
	$(PYTHON) scripts/benchmark.py --output docs/benchmark_report.md

evaluate-langchain: ## Run the LangChain baseline evaluation (OpenAI provider)
	$(PYTHON) scripts/run_langchain_eval.py --provider openai

calibrate: ## Run full calibration pipeline (system outputs → all rows → strict κ table). Costs ~$2 in API calls.
	$(PYTHON) scripts/run_calibration.py generate-outputs
	@for cfg in configs/calibration/rows/*.yaml; do \
		echo "==> running judges for $$cfg"; \
		$(PYTHON) scripts/run_calibration.py run-judges --row-config=$$cfg || exit 1; \
	done
	$(PYTHON) scripts/run_calibration.py build-table --strict

evaluate-judges: ## Re-run all rows + build-table against existing system_outputs (no regeneration). Costs ~$1.
	@for cfg in configs/calibration/rows/*.yaml; do \
		echo "==> running judges for $$cfg"; \
		$(PYTHON) scripts/run_calibration.py run-judges --row-config=$$cfg || exit 1; \
	done
	$(PYTHON) scripts/run_calibration.py build-table --strict

docker: ## Build and run the app via Docker Compose
	docker-compose -f docker/docker-compose.yaml up --build

## --- Infrastructure ---

modal-deploy: ## Deploy vLLM on Modal (prints endpoint URL)
	@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
	modal deploy modal/serve_vllm.py

modal-stop: ## Stop Modal deployment
	@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
	modal app stop agent-bench-vllm

vllm-up: ## Start local vLLM via Docker Compose (requires NVIDIA GPU)
	docker compose -f docker/docker-compose.vllm.yml up --build

benchmark-all: ## Run provider comparison (requires Modal deployment + API keys)
	$(PYTHON) modal/run_benchmark.py --base-url $(MODAL_VLLM_URL)

k8s-dev: ## Deploy to minikube (dev values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-dev.yaml \
		--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)

k8s-prod: ## Deploy via Helm (prod values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-prod.yaml \
		--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)

tf-plan: ## Run terraform plan (no apply)
	cd terraform && terraform plan

tf-validate: ## Validate terraform syntax
	cd terraform && terraform validate