# eval-runner / eval.yaml
# ouhenio's picture
# Bundle eval.yaml, fallback when GitHub repo is private
# 2c70552
# ─────────────────────────────────────────────────────────────────────────────
# Configuración de evaluación para modelos OLMo-3 post-training
#
# Usado por: evaluate.py
# ─────────────────────────────────────────────────────────────────────────────
# Output directory for evaluation results.
results_dir: eval_results
# Default GPU setting for vLLM runs.
# NOTE(review): presumably the number of GPUs rather than a device index — confirm in evaluate.py.
gpus: 1
# ── Suites for instruct models (SFT checkpoints) ────────────────────────────
# They use vLLM generation and require a chat template (already included in
# the HF checkpoints).
#
# `suites` maps a suite name to its definition. Every suite carries:
#   description  human-readable label
#   model_type   model backend identifier (all suites here use `vllm`)
#   tasks        list of task identifiers to run
# `post_training_es` additionally sets `backend: lm_eval`.
suites:
  # TÜLU 3 (official OLMES README)
  tulu:
    description: "TÜLU 3 eval suite (dev + unseen)"
    model_type: vllm
    tasks:
      - tulu_3_dev
      - tulu_3_unseen

  # OLMo 3 instruct/chat (official OLMES README)
  olmo3_chat:
    description: "OLMo 3 instruct/chat suite"
    model_type: vllm
    tasks:
      - olmo3:adapt

  # ── OLMo 3 base (official OLMES README) ───────────────────────────────────
  olmo3_base:
    description: "OLMo 3 base main (suite base principal)"
    model_type: vllm
    tasks:
      - olmo3:base:stem_qa_mc
      - olmo3:base:nonstem_qa_mc
      - olmo3:base:gen
      - olmo3:base:math
      - olmo3:base:code
      - olmo3:base:code_fim

  quick_check:
    description: "Quick completa: OLMES standard 10 MCQA"
    model_type: vllm
    tasks:
      - core_9mcqa::olmes
      - mmlu::olmes

  # ── Post-training experiments ─────────────────────────────────────────────
  post_training:
    description: "Post-training eval suite — capabilities + safety (EN, via OLMES)"
    model_type: vllm
    tasks:
      - mmlu:mc::olmes
      - hellaswag:mc::olmes
      - truthfulqa::tulu

  # Requires OPENAI_API_KEY (external_eval=safety_eval).
  post_training_safety:
    description: "Post-training safety (EN, external safety-eval; requires OpenAI key)"
    model_type: vllm
    tasks:
      - do_anything_now::default
      - harmbench::default
      - trustllm_jailbreaktrigger::default
      - wildjailbreak::harmful
      - wildjailbreak::benign
      - wildguardtest::default
      - xstest::default
      - strongreject::logprobs
      - toxigen::tiny
      - wmdp::default

  post_training_es:
    description: "Post-training eval suite — capabilities + safety (ES, via lm-eval-harness)"
    model_type: vllm
    backend: lm_eval
    tasks:
      - m_mmlu_es
      - hellaswag_es
      - truthfulqa_es_mc1
# ── Default models ───────────────────────────────────────────────────────────
# Default model locations; both are relative paths under `.runtime/`.
defaults:
  instruct_model_dir: .runtime/hf_models/olmo3-7b-sft-instruct
  base_model: .runtime/models/OLMo-3-7B