Spaces:
Sleeping
Sleeping
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Configuración de evaluación para modelos OLMo-3 post-training | |
| # | |
| # Usado por: evaluate.py | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Directorio de salida para resultados | |
| results_dir: eval_results | |
| # Número de GPUs por defecto (vLLM) | |
| gpus: 1 | |
| # ── Suites para modelos instruct (SFT checkpoints) ────────────────────────── | |
| # Usan generación con vLLM. Requieren chat template (ya incluido en los HF checkpoints). | |
| suites: | |
| # TÜLU 3 (README oficial de OLMES) | |
| tulu: | |
| description: "TÜLU 3 eval suite (dev + unseen)" | |
| model_type: vllm | |
| tasks: | |
| - tulu_3_dev | |
| - tulu_3_unseen | |
| # OLMo 3 instruct/chat (README oficial de OLMES) | |
| olmo3_chat: | |
| description: "OLMo 3 instruct/chat suite" | |
| model_type: vllm | |
| tasks: | |
| - olmo3:adapt | |
| # ── OLMo 3 base (README oficial de OLMES) ───────────────────────────────── | |
| olmo3_base: | |
| description: "OLMo 3 base main (suite base principal)" | |
| model_type: vllm | |
| tasks: | |
| - olmo3:base:stem_qa_mc | |
| - olmo3:base:nonstem_qa_mc | |
| - olmo3:base:gen | |
| - olmo3:base:math | |
| - olmo3:base:code | |
| - olmo3:base:code_fim | |
| quick_check: | |
| description: "Quick completa: OLMES standard 10 MCQA" | |
| model_type: vllm | |
| tasks: | |
| - core_9mcqa::olmes | |
| - mmlu::olmes | |
| # ── Post-training experiments ────────────────────────────────────────────── | |
| post_training: | |
| description: "Post-training eval suite — capabilities + safety (EN, via OLMES)" | |
| model_type: vllm | |
| tasks: | |
| - mmlu:mc::olmes | |
| - hellaswag:mc::olmes | |
| - truthfulqa::tulu | |
| # La suite post_training_safety requiere la variable de entorno OPENAI_API_KEY (external_eval=safety_eval). | |
| post_training_safety: | |
| description: "Post-training safety (EN, external safety-eval; requires OpenAI key)" | |
| model_type: vllm | |
| tasks: | |
| - do_anything_now::default | |
| - harmbench::default | |
| - trustllm_jailbreaktrigger::default | |
| - wildjailbreak::harmful | |
| - wildjailbreak::benign | |
| - wildguardtest::default | |
| - xstest::default | |
| - strongreject::logprobs | |
| - toxigen::tiny | |
| - wmdp::default | |
| post_training_es: | |
| description: "Post-training eval suite — capabilities + safety (ES, via lm-eval-harness)" | |
| model_type: vllm | |
| backend: lm_eval | |
| tasks: | |
| - m_mmlu_es | |
| - hellaswag_es | |
| - truthfulqa_es_mc1 | |
| # ── Modelos por defecto ────────────────────────────────────────────────────── | |
| defaults: | |
| instruct_model_dir: .runtime/hf_models/olmo3-7b-sft-instruct | |
| base_model: .runtime/models/OLMo-3-7B | |