| import verifiers as vf |
|
|
| """ |
| # install |
| vf-install complex-json-output (-p /path/to/environments) |
| |
| # quick eval |
| vf-eval complex-json-output (-m model_name in endpoints.py) |
| |
| inference: |
| CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 vf-vllm --model Qwen/Qwen2.5-1.5B-Instruct \ |
| --data-parallel-size 6 --enforce-eager --disable-log-requests |
| |
| training: |
| CUDA_VISIBLE_DEVICES=6,7 accelerate launch --num-processes 2 \ |
| --config-file configs/zero3.yaml examples/grpo/train_complex_json_output.py |
| """ |
|
|
| |
# Reference checklist of the GRPO hyperparameters this script overrides below.
# Not read anywhere else in this file as far as visible here — presumably kept
# for sweep/logging bookkeeping; verify against other tooling before deleting.
HPARAMS = [
    "per_device_train_batch_size",
    "num_generations",
    "gradient_accumulation_steps",
    "max_tokens",
    "max_seq_len",
    "max_prompt_length",
    "max_completion_length",
    "temperature",
    "learning_rate",
    "max_steps",
    "warmup_steps",
    "eval_steps",
    "save_steps",
    "beta",
    "loss_type",
]
|
|
| |
# Build the RL environment that supplies prompts and grades completions.
# Dataset sizing is collected first so the split sizes read as one unit.
_dataset_sizes = dict(
    num_train_examples=8000,
    num_eval_examples=50,
)
vf_env = vf.load_environment(env_id="complex-json-output", **_dataset_sizes)
|
|
| |
# Local checkpoint to fine-tune; the W&B/run identifier embeds the model's
# basename (lowercased) so runs from different checkpoints stay distinct.
model_name = "/raid/workspace/Mango/verifiers/MS3.2-0.35-Beta"
run_name = f"complex-json-grpo_{model_name.rsplit('/', 1)[-1].lower()}"
|
|
| |
# Load the policy model and its tokenizer from the local checkpoint path.
model, tokenizer = vf.get_model_and_tokenizer(model_name)
|
|
| |
# Start from the library's GRPO defaults, then apply run-specific overrides.
training_args = vf.grpo_defaults(run_name=run_name)

# Override table, grouped by concern. Applied in insertion order below —
# each entry is a plain attribute assignment on `training_args`.
_overrides = {
    # --- batch geometry ---
    "per_device_train_batch_size": 2,
    "num_generations": 16,
    "gradient_accumulation_steps": 2,
    # --- sequence-length / sampling budget ---
    # NOTE(review): max_tokens (2048) is smaller than max_completion_length
    # (4096) — confirm which one actually caps generation in `verifiers`.
    "max_tokens": 2048,
    "max_seq_len": 16000,
    "max_prompt_length": 8192,
    "max_completion_length": 4096,
    "temperature": 0.1,
    # --- optimization schedule ---
    "learning_rate": 5e-6,
    "max_steps": 1000,
    "warmup_steps": 15,
    # --- evaluation ---
    # Strategy "none" disables periodic eval; the step/batch settings are
    # retained so flipping the strategy back on needs a one-line change.
    "eval_strategy": "none",
    "eval_steps": 50,
    "per_device_eval_batch_size": 8,
    # --- checkpointing ---
    "save_strategy": "steps",
    "save_steps": 100,
    # --- GRPO-specific knobs ---
    "beta": 0.001,
    "loss_type": "dr_grpo",
    # --- logging ---
    "logging_steps": 1,
    "log_completions": True,
    "num_completions_to_print": 3,
    "report_to": "wandb",
}
for _attr, _value in _overrides.items():
    setattr(training_args, _attr, _value)
|
|
| |
# Assemble the GRPO trainer. LoRA adapters (small rank) keep the fine-tune
# lightweight; the full base model is not updated.
_lora_config = vf.lora_defaults(r=8, alpha=16)
trainer = vf.GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    env=vf_env,
    args=training_args,
    peft_config=_lora_config,
)

# Run the training loop to completion (bounded by max_steps above).
trainer.train()
|
|