# agentbench Makefile
PYTHON ?= python3.11
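# `?=` only sets a default, so the interpreter can be overridden per invocation,
# e.g.: make test PYTHON=python3.12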
.PHONY: install test lint serve ingest ingest-k8s evaluate-fast evaluate-full benchmark evaluate-langchain calibrate evaluate-judges docker modal-deploy modal-stop vllm-up benchmark-all k8s-dev k8s-prod tf-plan tf-validate
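# Added convenience target (not part of the original pipeline; assumes GNU make
# plus grep/awk on PATH): lists every target annotated with the `##` comments
# already used throughout this file.
.PHONY: help
help: ## Show this help
	@grep -E '^[a-zA-Z0-9_-]+:.*## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*## "} {printf "  %-20s %s\n", $$1, $$2}'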
install:
$(PYTHON) -m pip install -e ".[dev]"
test:
$(PYTHON) -m pytest tests/ -v --tb=short
lint:
ruff check agent_bench/ tests/
ruff format --check agent_bench/ tests/
mypy agent_bench/ --ignore-missing-imports
serve:
$(PYTHON) -m uvicorn agent_bench.serving.app:create_app --factory --reload --port 8000
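# create_app is loaded as a factory; if it returns a FastAPI app (assumed here,
# not verified), the interactive API docs are served at http://localhost:8000/docs.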
ingest:
$(PYTHON) scripts/ingest.py --config configs/tasks/tech_docs.yaml
ingest-k8s: ## Ingest Kubernetes docs into .cache/store_k8s
$(PYTHON) scripts/ingest.py --doc-dir data/k8s_docs --store-path .cache/store_k8s
evaluate-fast:
$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode deterministic
evaluate-full:
$(PYTHON) scripts/evaluate.py --config configs/default.yaml --mode full
benchmark:
$(PYTHON) scripts/benchmark.py --output docs/benchmark_report.md
evaluate-langchain:
$(PYTHON) scripts/run_langchain_eval.py --provider openai
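# The openai provider reads credentials from the environment (typically OPENAI_API_KEY).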
calibrate: ## Run full calibration pipeline (system outputs → all rows → strict κ table). Costs ~$2 in API calls.
	$(PYTHON) scripts/run_calibration.py generate-outputs
	$(MAKE) evaluate-judges
evaluate-judges: ## Re-run all rows + build-table against existing system_outputs (no regeneration). Costs ~$1.
@for cfg in configs/calibration/rows/*.yaml; do \
echo "==> running judges for $$cfg"; \
$(PYTHON) scripts/run_calibration.py run-judges --row-config=$$cfg || exit 1; \
done
$(PYTHON) scripts/run_calibration.py build-table --strict
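# A failing row aborts the loop (|| exit 1), so the κ table is only rebuilt from
# a complete set of judge runs.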
docker:
	docker compose -f docker/docker-compose.yaml up --build
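# Tear down with: docker compose -f docker/docker-compose.yaml down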
## --- Infrastructure ---
modal-deploy: ## Deploy vLLM on Modal (prints endpoint URL)
@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
modal deploy modal/serve_vllm.py
modal-stop: ## Stop Modal deployment
@command -v modal >/dev/null 2>&1 || { echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; exit 1; }
modal app stop agent-bench-vllm
vllm-up: ## Start local vLLM via Docker Compose (requires NVIDIA GPU)
docker compose -f docker/docker-compose.vllm.yml up --build
benchmark-all: ## Run provider comparison (requires Modal deployment + API keys)
	@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
	$(PYTHON) modal/run_benchmark.py --base-url $(MODAL_VLLM_URL)
k8s-dev: ## Deploy to minikube (dev values, set MODAL_VLLM_URL first)
@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-dev.yaml \
--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)
k8s-prod: ## Deploy via Helm (prod values, set MODAL_VLLM_URL first)
@test -n "$(MODAL_VLLM_URL)" || (echo "Error: MODAL_VLLM_URL is not set" && exit 1)
helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-prod.yaml \
--set provider.selfhosted.modalEndpoint=$(MODAL_VLLM_URL)
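# Either environment's release can be removed with: helm uninstall agent-bench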
tf-plan: ## Run terraform plan (no apply)
cd terraform && terraform plan
tf-validate: ## Validate terraform syntax
cd terraform && terraform validate
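# Both terraform targets assume the directory has already been initialized
# (terraform init); neither one applies changes.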