# agentbench Makefile
PYTHON ?= python3.11
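# `?=` only sets a default, so the interpreter can be overridden per invocation,
# e.g.: make test PYTHON=python3.12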
.PHONY: install test lint serve ingest ingest-k8s evaluate-fast evaluate-full benchmark evaluate-langchain calibrate evaluate-judges docker modal-deploy modal-stop vllm-up benchmark-all k8s-dev k8s-prod tf-plan tf-validate
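# Added convenience target (not part of the original pipeline; assumes GNU make
# plus grep/awk on PATH): lists every target annotated with the `##` comments
# already used throughout this file.
.PHONY: help
help: ## Show this help
	@grep -E '^[a-zA-Z0-9_-]+:.*## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*## "} {printf "  %-20s %s\n", $$1, $$2}'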
install:
$(PYTHON) -m pip install -e ".[dev]"
test:
$(PYTHON) -m pytest tests/ -v --tb=short
lint:
ruff check agent_bench/ tests/
ruff format --check agent_bench/ tests/
mypy agent_bench/ --ignore-missing-imports
serve:
$(PYTHON) -m uvicorn agent_bench.serving.app:create_app --factory --reload --port 8000
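# create_app is loaded as a factory; if it returns a FastAPI app (assumed here,
# not verified), the interactive API docs are served at http://localhost:8000/docs.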
ingest:
$(PYTHON) scripts/ingest.py --config configs/tasks/tech_docs.yaml
ingest-k8s: ## Ingest Kubernetes docs into .cache/store_k8s
$(PYTHON) scripts/ingest.py --doc-dir data/k8s_docs --store-path .cache/store_k8s
evaluate-fast:
$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode deterministic
evaluate-full:
$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode full
benchmark:
$(PYTHON) scripts/benchmark.py --output docs/benchmark_report.md
evaluate-langchain:
$(PYTHON) scripts/run_langchain_eval.py --provider openai
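# The openai provider reads credentials from the environment (typically OPENAI_API_KEY).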
calibrate: ## Run full calibration pipeline (system outputs → all rows → strict κ table). Costs ~$2 in API calls.
	$(PYTHON) scripts/run_calibration.py generate-outputs
	$(MAKE) evaluate-judges
evaluate-judges: ## Re-run all rows + build-table against existing system_outputs (no regeneration). Costs ~$1.
@for cfg in configs/calibration/rows/*.yaml; do \
echo "==> running judges for $$cfg"; \
$(PYTHON) scripts/run_calibration.py run-judges --row-config=$$cfg || exit 1; \
done
$(PYTHON) scripts/run_calibration.py build-table --strict
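# A failing row aborts the loop (|| exit 1), so the κ table is only rebuilt from
# a complete set of judge runs.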
docker:
	docker compose -f docker/docker-compose.yaml up --build
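# Tear down with: docker compose -f docker/docker-compose.yaml down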
## --- Infrastructure ---
modal-deploy: ## Deploy vLLM on Modal (prints endpoint URL)
@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
modal deploy modal/serve_vllm.py
modal-stop: ## Stop Modal deployment
@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
modal app stop agent-bench-vllm
vllm-up: ## Start local vLLM via Docker Compose (requires NVIDIA GPU)
docker compose -f docker/docker-compose.vllm.yml up --build
benchmark-all: ## Run provider comparison (requires Modal deployment + API keys)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	$(PYTHON) modal/run_benchmark.py --base-url $(MODAL_VLLM_URL)
k8s-dev: ## Deploy to minikube (dev values, set MODAL_VLLM_URL first)
@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-dev.yaml \
--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)
k8s-prod: ## Deploy via Helm (prod values, set MODAL_VLLM_URL first)
@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-prod.yaml \
--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)
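# Either environment's release can be removed with: helm uninstall agent-bench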
tf-plan: ## Run terraform plan (no apply)
cd terraform && terraform plan
tf-validate: ## Validate terraform syntax
cd terraform && terraform validate
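# Both terraform targets assume the directory has already been initialized
# (terraform init); neither one applies changes.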