Spaces:

Nomearod
/

agentbench

Sleeping

File size: 3,501 Bytes

9d976db
 
508e5ef
ef5d585
 
9d976db
ef5d585
 
9d976db
ef5d585
 
 
 
 
 
 
9d976db
ef5d585
 
9d976db
ef5d585
3c0089e
 
 
ef5d585
9d976db
ef5d585
 
9d976db
ef5d585
 
9d976db
ef5d585
9f98da1
 
 
508e5ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef5d585
 
a9d4375

# Interpreter used by every Python-based target. Override it on the command
# line or through the environment, e.g. `make test PYTHON=.venv/bin/python`.
# The default is the Homebrew python@3.11 binary when it exists at that path;
# on machines without it, fall back to `python3` from PATH instead of failing
# with "No such file or directory".
PYTHON ?= $(or $(wildcard /usr/local/opt/python@3.11/bin/python3.11),python3)

# Every target in this file is a command, not a file to build. Declaring them
# phony keeps them runnable even when a same-named path exists (the recipes
# below reference docker/ and modal/ directories, for example).
.PHONY: install test lint serve docker
.PHONY: ingest ingest-k8s
.PHONY: evaluate-fast evaluate-full evaluate-langchain evaluate-judges calibrate
.PHONY: benchmark benchmark-all
.PHONY: modal-deploy modal-stop vllm-up
.PHONY: k8s-dev k8s-prod tf-plan tf-validate

# Editable install of this project plus its "dev" extra, using $(PYTHON)'s pip.
install:
	$(PYTHON) -m pip install --editable ".[dev]"

# Run the pytest suite under tests/ with verbose output and short tracebacks.
test:
	$(PYTHON) -m pytest --verbose --tb=short tests/

# Static checks: ruff lint, ruff formatting check (no rewrite), then mypy.
# The tools are invoked as modules of $(PYTHON), like the other Python targets,
# so they come from the same environment that `make install` installs into
# rather than from whatever happens to be first on PATH.
# NOTE(review): assumes ruff and mypy are installed in that environment
# (presumably via the "dev" extra) — confirm.
lint:
	$(PYTHON) -m ruff check agent_bench/ tests/
	$(PYTHON) -m ruff format --check agent_bench/ tests/
	$(PYTHON) -m mypy agent_bench/ --ignore-missing-imports

# Start the app locally with uvicorn on port 8000. `--factory` marks create_app
# as a callable that returns the application; `--reload` enables auto-reload.
serve:
	$(PYTHON) -m uvicorn agent_bench.serving.app:create_app \
		--factory \
		--reload \
		--port 8000

# Run the ingestion script against the tech_docs task configuration.
ingest:
	$(PYTHON) scripts/ingest.py \
		--config configs/tasks/tech_docs.yaml

# Same ingestion script as `ingest`, but pointed at an explicit document
# directory and store path instead of a task config file.
ingest-k8s:  ## Ingest Kubernetes docs into .cache/store_k8s
	$(PYTHON) scripts/ingest.py \
		--doc-dir data/k8s_docs \
		--store-path .cache/store_k8s

# Run the evaluation script with the default config in "deterministic" mode.
evaluate-fast:
	$(PYTHON) scripts/evaluate.py \
		--config configs/default.yaml \
		--mode deterministic

# Run the evaluation script with the default config in "full" mode.
evaluate-full:
	$(PYTHON) scripts/evaluate.py \
		--config configs/default.yaml \
		--mode full

# Run the benchmark script, passing docs/benchmark_report.md as its --output.
benchmark:
	$(PYTHON) scripts/benchmark.py \
		--output docs/benchmark_report.md

# Run the LangChain evaluation script with the "openai" provider selected.
evaluate-langchain:
	$(PYTHON) scripts/run_langchain_eval.py \
		--provider openai

# Three stages, in order: generate system outputs, run the judges once per row
# config under configs/calibration/rows/, then build the table in strict mode.
# The loop stops at the first failing row (`|| exit 1`) so the table is never
# built from a partial run. If the glob matches nothing the shell leaves the
# pattern unexpanded; the `-e` test turns that into a clear error instead of
# handing a literal "*.yaml" path to the script. "$$cfg" is quoted so a path
# containing whitespace is passed as a single argument.
calibrate:  ## Run full calibration pipeline (system outputs → all rows → strict κ table). Costs ~$2 in API calls.
	$(PYTHON) scripts/run_calibration.py generate-outputs
	@for cfg in configs/calibration/rows/*.yaml; do \
		[ -e "$$cfg" ] || { echo "Error: no row configs match configs/calibration/rows/*.yaml" >&2; exit 1; }; \
		echo "==> running judges for $$cfg"; \
		$(PYTHON) scripts/run_calibration.py run-judges --row-config="$$cfg" || exit 1; \
	done
	$(PYTHON) scripts/run_calibration.py build-table --strict

# Runs the judges once per row config under configs/calibration/rows/, then
# builds the table in strict mode; the generate-outputs stage is not run here.
# The loop stops at the first failing row (`|| exit 1`). If the glob matches
# nothing the shell leaves the pattern unexpanded; the `-e` test turns that
# into a clear error instead of handing a literal "*.yaml" path to the script.
# "$$cfg" is quoted so a path containing whitespace stays a single argument.
evaluate-judges:  ## Re-run all rows + build-table against existing system_outputs (no regeneration). Costs ~$1.
	@for cfg in configs/calibration/rows/*.yaml; do \
		[ -e "$$cfg" ] || { echo "Error: no row configs match configs/calibration/rows/*.yaml" >&2; exit 1; }; \
		echo "==> running judges for $$cfg"; \
		$(PYTHON) scripts/run_calibration.py run-judges --row-config="$$cfg" || exit 1; \
	done
	$(PYTHON) scripts/run_calibration.py build-table --strict

# Build and start the stack described by docker/docker-compose.yaml.
# Uses the `docker compose` plugin (Compose V2), the same CLI the vllm-up
# target uses, instead of the legacy standalone `docker-compose` binary.
docker:
	docker compose -f docker/docker-compose.yaml up --build

## --- Infrastructure ---

# Fails early with an install hint when the `modal` CLI is not on PATH;
# otherwise deploys modal/serve_vllm.py.
modal-deploy:  ## Deploy vLLM on Modal (prints endpoint URL)
	@if ! command -v modal >/dev/null 2>&1; then \
		echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; \
		exit 1; \
	fi
	modal deploy modal/serve_vllm.py

# Fails early with an install hint when the `modal` CLI is not on PATH;
# otherwise stops the Modal app named agent-bench-vllm.
modal-stop:  ## Stop Modal deployment
	@if ! command -v modal >/dev/null 2>&1; then \
		echo "Error: modal CLI not found. Run: pip install -e '.[modal]' && modal setup"; \
		exit 1; \
	fi
	modal app stop agent-bench-vllm

# Build and start the services defined in docker/docker-compose.vllm.yml.
vllm-up:  ## Start local vLLM via Docker Compose (requires NVIDIA GPU)
	docker compose \
		--file docker/docker-compose.vllm.yml \
		up --build

# Runs modal/run_benchmark.py against the endpoint given in MODAL_VLLM_URL
# (make variable or environment). The guard mirrors the k8s-* targets: with the
# variable unset the script would otherwise receive a bare `--base-url` with no
# value. The URL is quoted so shell metacharacters in it cannot split the command.
benchmark-all:  ## Run provider comparison (requires Modal deployment + API keys)
	@test -n "$(MODAL_VLLM_URL)" || { echo "Error: MODAL_VLLM_URL is not set" >&2; exit 1; }
	$(PYTHON) modal/run_benchmark.py --base-url "$(MODAL_VLLM_URL)"

# Installs the agent-bench Helm release with the dev values file and passes
# MODAL_VLLM_URL (make variable or environment) as the self-hosted provider's
# endpoint. Aborts before calling helm when the variable is empty; the message
# goes to stderr. The URL is quoted so characters such as `&` or `?` are not
# interpreted by the shell.
# NOTE(review): `helm install` errors out if a release named agent-bench already
# exists; consider `helm upgrade --install` if re-deploys should be repeatable.
k8s-dev:  ## Deploy to minikube (dev values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || { echo "Error: MODAL_VLLM_URL is not set" >&2; exit 1; }
	helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-dev.yaml \
		--set provider.selfhosted.modalEndpoint="$(MODAL_VLLM_URL)"

# Installs the agent-bench Helm release with the prod values file and passes
# MODAL_VLLM_URL (make variable or environment) as the self-hosted provider's
# endpoint. Aborts before calling helm when the variable is empty; the message
# goes to stderr. The URL is quoted so characters such as `&` or `?` are not
# interpreted by the shell.
# NOTE(review): `helm install` errors out if a release named agent-bench already
# exists; consider `helm upgrade --install` if re-deploys should be repeatable.
k8s-prod:  ## Deploy via Helm (prod values, set MODAL_VLLM_URL first)
	@test -n "$(MODAL_VLLM_URL)" || { echo "Error: MODAL_VLLM_URL is not set" >&2; exit 1; }
	helm install agent-bench k8s/helm/agent-bench/ -f k8s/helm/agent-bench/values-prod.yaml \
		--set provider.selfhosted.modalEndpoint="$(MODAL_VLLM_URL)"

# Both commands share one recipe line (joined with `&&`) because each recipe
# line runs in its own shell; the `cd` would not carry over otherwise.
tf-plan:  ## Run terraform plan (no apply)
	cd terraform && \
		terraform plan

# Both commands share one recipe line (joined with `&&`) because each recipe
# line runs in its own shell; the `cd` would not carry over otherwise.
tf-validate:  ## Validate terraform syntax
	cd terraform && \
		terraform validate