---
# ─────────────────────────────────────────────────────────────────────────────
# Evaluation configuration for OLMo-3 post-training models
#
# Used by: evaluate.py
# ─────────────────────────────────────────────────────────────────────────────

# Output directory for results
results_dir: eval_results

# Default GPUs (vLLM)
gpus: 1

# ── Suites for instruct models (SFT checkpoints) ─────────────────────────────
# These use generation with vLLM. They require a chat template (already
# included in the HF checkpoints).
# Each suite below has a `description`, a `model_type`, and a `tasks` list.
suites:
  # TÜLU 3 (official OLMES README)
  tulu:
    description: "TÜLU 3 eval suite (dev + unseen)"
    model_type: vllm
    tasks:
      - tulu_3_dev
      - tulu_3_unseen

  # OLMo 3 instruct/chat (official OLMES README)
  olmo3_chat:
    description: "OLMo 3 instruct/chat suite"
    model_type: vllm
    tasks:
      - olmo3:adapt

  # ── OLMo 3 base (official OLMES README) ───────────────────────────────────
  olmo3_base:
    description: "OLMo 3 base main (suite base principal)"
    model_type: vllm
    tasks:
      - olmo3:base:stem_qa_mc
      - olmo3:base:nonstem_qa_mc
      - olmo3:base:gen
      - olmo3:base:math
      - olmo3:base:code
      - olmo3:base:code_fim

  quick_check:
    description: "Quick completa: OLMES standard 10 MCQA"
    model_type: vllm
    tasks:
      - core_9mcqa::olmes
      - mmlu::olmes

  # ── Post-training experiments ─────────────────────────────────────────────
  post_training:
    description: "Post-training eval suite — capabilities + safety (EN, via OLMES)"
    model_type: vllm
    tasks:
      - mmlu:mc::olmes
      - hellaswag:mc::olmes
      - truthfulqa::tulu

  # Requires OPENAI_API_KEY (external_eval=safety_eval).
  post_training_safety:
    description: "Post-training safety (EN, external safety-eval; requires OpenAI key)"
    model_type: vllm
    tasks:
      - do_anything_now::default
      - harmbench::default
      - trustllm_jailbreaktrigger::default
      - wildjailbreak::harmful
      - wildjailbreak::benign
      - wildguardtest::default
      - xstest::default
      - strongreject::logprobs
      - toxigen::tiny
      - wmdp::default

  # NOTE(review): this is the only suite that sets `backend`; how evaluate.py
  # picks a backend when the key is absent is not visible here — confirm.
  post_training_es:
    description: "Post-training eval suite — capabilities + safety (ES, via lm-eval-harness)"
    model_type: vllm
    backend: lm_eval
    tasks:
      - m_mmlu_es
      - hellaswag_es
      - truthfulqa_es_mc1

# ── Default models ───────────────────────────────────────────────────────────
defaults:
  instruct_model_dir: .runtime/hf_models/olmo3-7b-sft-instruct
  base_model: .runtime/models/OLMo-3-7B