hf-papers / scripts /run_all_evals.sh
evalstate's picture
evalstate HF Staff
sync: promote hf_hub_community prompt v3 + add prompt/coverage harness
bba4fab verified
#!/usr/bin/env bash
#
# Runs the project's evaluation stages one after another from the repo root.
#
# Optional environment overrides (each falls back to its default when unset
# or empty):
#   MODELS              passed as --models            (default: gpt-oss)
#   ROUTER_AGENT        passed as --agent             (default: hf_hub_community)
#   ROUTER_AGENT_CARDS  passed as --agent-cards       (default: $ROOT/.fast-agent/tool-cards)
#   TIMEOUT             passed as --timeout           (default: 240)
#   RUN_COMMUNITY       "1" enables the community scoring stage   (default: 1)
#   RUN_ROUTING         "1" enables the tool routing stage        (default: 1)
#   RUN_DESC_AB         "1" enables the description A/B stage     (default: 1)
set -euo pipefail

# The repository root is the parent of the directory holding this script;
# all later relative paths (scripts/..., docs/...) are resolved from there.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"

# Apply defaults in place; a caller-supplied non-empty value always wins.
: "${MODELS:=gpt-oss}"
: "${ROUTER_AGENT:=hf_hub_community}"
: "${ROUTER_AGENT_CARDS:=$ROOT/.fast-agent/tool-cards}"
: "${TIMEOUT:=240}"
: "${RUN_COMMUNITY:=1}"
: "${RUN_ROUTING:=1}"
: "${RUN_DESC_AB:=1}"

# Echo the effective configuration; printf repeats the format per argument.
printf '[info] %s\n' \
  "root=$ROOT" \
  "models=$MODELS" \
  "router_agent=$ROUTER_AGENT" \
  "router_agent_cards=$ROUTER_AGENT_CARDS"
#######################################
# Print a section banner: a blank line, then the arguments (joined into one
# string) framed by rows of '=' characters.
# Arguments: $@ - words making up the banner title
# Outputs:   writes the banner to stdout
#######################################
action() {
  printf '\n========== %s ==========\n' "$*"
}
# Stage 1: community challenge scoring; runs only when RUN_COMMUNITY is "1".
if [[ $RUN_COMMUNITY == 1 ]]; then
  action "HF Hub community challenge scoring"
  community_args=(--timeout "$TIMEOUT")
  python scripts/score_hf_hub_community_challenges.py "${community_args[@]}"
fi
# Stage 2: tool routing batch; runs only when RUN_ROUTING is "1".
if [[ $RUN_ROUTING == 1 ]]; then
  action "Tool routing batch"
  # Collect the flags in an array so each value stays a single argument.
  routing_args=(
    --models "$MODELS"
    --agent "$ROUTER_AGENT"
    --agent-cards "$ROUTER_AGENT_CARDS"
    --timeout "$TIMEOUT"
  )
  python scripts/run_tool_routing_batch.py "${routing_args[@]}"
fi
# Stage 3: tool description A/B evaluation followed by the plotting script;
# both run only when RUN_DESC_AB is "1".
if [[ $RUN_DESC_AB == 1 ]]; then
  action "Tool description A/B"
  desc_ab_args=(
    --models "$MODELS"
    --timeout "$TIMEOUT"
  )
  python scripts/eval_tool_description_ab.py "${desc_ab_args[@]}"

  action "Tool description plots + interpretation"
  python scripts/plot_tool_description_eval.py
fi
# Closing banner plus a pointer to the results documentation.
action "Done"
printf '%s\n' "See docs/RESULTS.md for report locations."