# Makefile — developer tasks for agent-bench: install/test/lint, serving,
# ingestion, evaluation, calibration, Docker, Modal, Kubernetes and Terraform.
# Interpreter used by every Python-based target. `?=` keeps it overridable
# from the environment or CLI, e.g. `make test PYTHON=python3.11`.
PYTHON ?= /usr/local/opt/python@3.11/bin/python3.11

# `install` is the first rule; state the default goal explicitly so reordering
# rules later cannot silently change what bare `make` does.
.DEFAULT_GOAL := install

# Every target here is a command, not a file — declare them all phony so a
# stray file with the same name never makes a target appear "up to date".
.PHONY: install test lint serve ingest ingest-k8s evaluate-fast evaluate-full benchmark evaluate-langchain calibrate evaluate-judges docker modal-deploy modal-stop vllm-up benchmark-all k8s-dev k8s-prod tf-plan tf-validate
install: ## Editable install of the package with dev extras
	$(PYTHON) -m pip install -e ".[dev]"

test: ## Run the pytest suite (verbose, short tracebacks)
	$(PYTHON) -m pytest tests/ -v --tb=short

lint: ## Static checks: ruff lint, ruff format (check-only), mypy
	ruff check agent_bench/ tests/
	ruff format --check agent_bench/ tests/
	mypy agent_bench/ --ignore-missing-imports
serve: ## Serve the FastAPI app locally on :8000 with auto-reload
	$(PYTHON) -m uvicorn agent_bench.serving.app:create_app --factory --reload --port 8000

ingest: ## Ingest the default corpus described by configs/tasks/tech_docs.yaml
	$(PYTHON) scripts/ingest.py --config configs/tasks/tech_docs.yaml

ingest-k8s: ## Ingest Kubernetes docs into .cache/store_k8s
	$(PYTHON) scripts/ingest.py --doc-dir data/k8s_docs --store-path .cache/store_k8s
evaluate-fast: ## Evaluation in deterministic mode (default config)
	$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode deterministic

evaluate-full: ## Evaluation in full mode (default config)
	$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode full

benchmark: ## Generate the benchmark report at docs/benchmark_report.md
	$(PYTHON) scripts/benchmark.py --output docs/benchmark_report.md

evaluate-langchain: ## Run the LangChain evaluation against the OpenAI provider
	$(PYTHON) scripts/run_langchain_eval.py --provider openai
# The judge loop + table build is shared with `evaluate-judges`; `calibrate`
# only adds the generate-outputs step up front, then recurses via $(MAKE)
# instead of duplicating the recipe. `+` lets the sub-make run under `make -n`.
calibrate: ## Run full calibration pipeline (system outputs → all rows → strict κ table). Costs ~$2 in API calls.
	$(PYTHON) scripts/run_calibration.py generate-outputs
	+$(MAKE) evaluate-judges

evaluate-judges: ## Re-run all rows + build-table against existing system_outputs (no regeneration). Costs ~$1.
	@for cfg in configs/calibration/rows/*.yaml; do \
		echo "==> running judges for $$cfg"; \
		$(PYTHON) scripts/run_calibration.py run-judges --row-config=$$cfg || exit 1; \
	done
	$(PYTHON) scripts/run_calibration.py build-table --strict
# Use Compose v2 (`docker compose`) — the standalone v1 `docker-compose`
# binary is deprecated, and `vllm-up` below already uses the v2 invocation.
docker: ## Build and start the stack defined in docker/docker-compose.yaml
	docker compose -f docker/docker-compose.yaml up --build
## --- Infrastructure ---

modal-deploy: ## Deploy vLLM on Modal (prints endpoint URL)
	@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
	modal deploy modal/serve_vllm.py

modal-stop: ## Stop Modal deployment
	@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
	modal app stop agent-bench-vllm
vllm-up: ## Start local vLLM via Docker Compose (requires NVIDIA GPU)
	docker compose -f docker/docker-compose.vllm.yml up --build

# Guard MODAL_VLLM_URL like the k8s targets do — an unset variable would
# otherwise expand to an empty `--base-url` argument and fail confusingly.
benchmark-all: ## Run provider comparison (requires Modal deployment + API keys)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	$(PYTHON) modal/run_benchmark.py --base-url $(MODAL_VLLM_URL)
# `helm upgrade --install` makes re-deploys idempotent: it installs the
# release if absent and upgrades it otherwise, whereas plain `helm install`
# errors out when the release already exists.
k8s-dev: ## Deploy to minikube (dev values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	helm upgrade --install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-dev.yaml \
		--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)

k8s-prod: ## Deploy via Helm (prod values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	helm upgrade --install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-prod.yaml \
		--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)
# Each recipe line runs in its own shell, so `cd` and the terraform command
# must stay chained with `&&` on a single line.
tf-plan: ## Run terraform plan (no apply)
	cd terraform && terraform plan

tf-validate: ## Validate terraform syntax
	cd terraform && terraform validate