diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: 
constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + 
_target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + 
log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: 
${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + 
return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + 
_target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: 
${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + 
_target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..599a58a76524a82196375a89e9799deffb783d12 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- 
actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 
0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: 
[] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: 
verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + 
enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: 
${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + 
val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + 
external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: 
verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + 
_target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98a95643027e6e5d887350f3fccc85b2b641c1f3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0711f3205e7405a13d0858ade535b3876b220f59 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88e37e42277bb0281f7da13d66247969dbd034ff --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28c4b07fe714adcc30149fab377385e364c43241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..441b2dd2afa248d401a083ced38f027dfd127e46 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6163aa3d72edcfbbfc0f1a43ffb904357d97cb39 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml @@ -0,0 +1,207 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..811e1d8b3d29e0b89d31c92531bf81ed525d9a0f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml @@ -0,0 +1,40 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..2d937ef596c29a47778876a523029e45331da7fa --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + 
shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 
+ use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + 
optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: 
true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: 
${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: 
/tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + 
filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + 
lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: 
${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true 
+global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- 
actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + 
tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: 
true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + 
load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: 
${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + 
return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + 
_target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: 
${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true 
+global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c47d6b2f2d6efce57a40efef1ce4a49a555b9fe --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..829134df85779cad5d4138968c1f8e9b0476bf65 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bdcd68b3620e08120db72f9637be2ff8ed0f428 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=256 + - data.max_response_length=256 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=256,data.max_response_length=256,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f06a900c99d8943ea4b7f1bfea5bcc16af966d1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=256 +- data.max_response_length=256 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b8eb11626ebc1e77d39ac0e2288c470fdee0d24 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bdc84d2a862fb28d60e4edbaceef18d23f39ee6 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea2ce354f91d73e35f331e0e208f0ec73547efde --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..053ad4256269557f4e9948f7e44454bf42e28517 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: '' + experiment_name: '' + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93cbb6436df074d32489293796f07bc3d1f55115 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + 
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name= + - trainer.experiment_name= + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a5e920e50bb4557935d2d4e7be1f8ac80d46241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name= +- trainer.experiment_name= +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..053ad4256269557f4e9948f7e44454bf42e28517 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + 
clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: 
${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: 
verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 
+ default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + 
_target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + 
lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: 
verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: '' + experiment_name: '' + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e72064b9dffaf1b54c89b4d73f2eef5ad170ca7c --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + 
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name= + - trainer.experiment_name= + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a5e920e50bb4557935d2d4e7be1f8ac80d46241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name= +- trainer.experiment_name= +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + 
clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: 
${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: 
verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 
+ default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + 
_target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + 
lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: 
verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc11d1bbff71d2857efbf1edef08a81d573a50f5 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-25-00/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-25-00/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17e2b6d7bdbee28362cf3ce481317dc0b5c563a1 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. 
+ + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - 
actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-35-01/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-35-01/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 
+ calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9c4a6e8292582158ff32826d10566cd72c3a6b9 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f7b60e241a232606009053fee7a9d12d876d1ae --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd7a40a04ba1ff9603f5f0e84fe00a55f3796126 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfff1d726a653988a9400013e9550e4f378045a8 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8886b384b143167ef57d3eaa008903f339532d10 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + 
dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 
+ total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ce101a017b96527d4997b6a52bc615c492d1182 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: 
compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9adca99977e28d2d0dcbfa6012fc1d52661038d9 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: 
compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..877c20bdd3a014869b3408e4ba4ff8a77d573516 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34450aec9ebe996139ae6513e150856e47ef8db0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + 
name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73400d35e07cb1c4358345307bf9337ad16cdb1b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10
,trainer.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4e9fd6c7e73a162244a962347f408997e5aa2b2 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet 
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- 
trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34450aec9ebe996139ae6513e150856e47ef8db0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 
16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: 
false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + 
entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + 
tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + 
exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + 
lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + 
data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false 
+ micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: 
/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1cbe8c3e8fef63358dbcc5754163f8966f31b1d --- /dev/null +++ 
b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10
,trainer.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4e9fd6c7e73a162244a962347f408997e5aa2b2 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet 
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- 
trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cd1e8433dffa0b3ba420be3e346f4f5cd062014 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4a959b20f525c2d38248c56e3b3c57fc823b66 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py @@ -0,0 +1,139 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest +import ray + +from tests.checkpoint_engine.test_utils import create_rollout_worker_group, create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager +from verl.single_controller.ray.base import ( + RayResourcePool, + split_resource_pool, +) +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig + + +@pytest.mark.asyncio +@pytest.mark.parametrize("rebuild_group", [False, True]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_nccl_checkpoint_engine( + rebuild_group, + num_trainer, + num_rollout, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + "UCX_TLS": "rc,tcp,cuda", + "UCX_MAX_RNDV_RAILS": "4", + "UCX_LOG_LEVEL": "INFO", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig( + backend="nccl", engine_kwargs={"nccl": {"rebuild_group": rebuild_group}} + ) + model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, 
checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="nccl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +@pytest.mark.skip(reason="temporary skip since our ci environment is not ready") +@pytest.mark.asyncio +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_nixl_checkpoint_engine( + num_trainer, + num_rollout, + device, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + # TODO: it's pretty hard to set these environment variables right, please consult + # with your network admin. Maybe auto adjust UCX_* according to NCCL_IB_*? 
+ "UCX_TLS": "rc,ud,cuda", + # "UCX_IB_GID_INDEX": "3", # NCCL_IB_GID_INDEX + # "UCX_IB_DEVICES": "mlx5_1:1,mlx5_2:1,mlx5_3:1", # NCCL_IB_HCA + "UCX_RC_TIMEOUT": "30s", # NCCL_IB_TIMEOUT + "UCX_RC_RETRY_COUNT": "7", # NCCL_IB_RETRY_COUNT + "UCX_KEEPALIVE_INTERVAL": "1s", + "UCX_KEEPALIVE_NUM_EPS": "10", + "UCX_MAX_RNDV_RAILS": "4", + "UCX_IB_ROCE_REACHABILITY_MODE": "all", + "UCX_LOG_LEVEL": "INFO", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig(backend="nixl", engine_kwargs={"nixl": {"device": device}}) + model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="nixl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +if __name__ == "__main__": + test_nccl_checkpoint_engine( + rebuild_group=False, + num_trainer=2, + num_rollout=30, + num_nodes=4, + num_gpus_per_node=8, + check_allclose=False, + model_path=os.environ["HDFS_ROOT"] + "/model/Qwen3-30B-A3B-Base", + ) diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py new file mode 100644 index 
0000000000000000000000000000000000000000..b99fcc771bef4dca4eb13b836b436539fbb55172 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py @@ -0,0 +1,86 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest +import ray + +from tests.checkpoint_engine.test_utils import create_rollout_worker_group, create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager +from verl.single_controller.ray.base import ( + RayResourcePool, + split_resource_pool, +) +from verl.utils.device import get_device_name +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig + + +@pytest.mark.asyncio +@pytest.mark.parametrize("rebuild_group", [False]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_hccl_checkpoint_engine( + rebuild_group, + num_trainer, + num_rollout, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + "HCCL_CONNECT_TIMEOUT": "1500", + "HCCL_HOST_SOCKET_PORT_RANGE": "60000-60050", + "HCCL_NPU_SOCKET_PORT_RANGE": "61000-61050", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig( + backend="hccl", engine_kwargs={"hccl": {"rebuild_group": rebuild_group}} + ) 
+ model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + resource_pool.get_placement_groups(device_name=get_device_name()) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="hccl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +if __name__ == "__main__": + test_hccl_checkpoint_engine( + rebuild_group=False, + num_trainer=2, + num_rollout=6, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=False, + model_path=os.environ["HDFS_ROOT"] + "/model/Qwen3-30B-A3B-Base", + ) diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..193a9eaeb56035752bf82381770af1ecf63098a6 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py @@ -0,0 +1,121 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import os + +import pytest +import ray +from omegaconf import DictConfig +from openai import AsyncOpenAI + +from tests.checkpoint_engine.test_utils import create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager, CheckpointEngineWorker +from verl.single_controller.ray import ( + RayClassWithInitArgs, + RayResourcePool, + RayWorkerGroup, +) +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.device import get_device_name +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig +from verl.workers.rollout.replica import get_rollout_replica_class + + +@pytest.fixture +def init_config() -> DictConfig: + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = 8 + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = os.path.expanduser("~/models/Qwen/Qwen3-VL-2B-Instruct") + config.actor_rollout_ref.rollout.name = os.environ["ROLLOUT_NAME"] + config.actor_rollout_ref.rollout.skip_tokenizer_init = False + config.actor_rollout_ref.rollout.max_num_seqs = 256 + config.actor_rollout_ref.rollout.checkpoint_engine.backend = "nccl" if get_device_name() == "cuda" else "hccl" + + return config + + +@pytest.mark.asyncio +async def test_server_adapter(init_config): + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + 
"VLLM_USE_V1": "1", + "VLLM_DISABLE_COMPILE_CACHE": "1", + } + } + ) + + # 1. create trainer worker group + model_config: HFModelConfig = omega_conf_to_dataclass(init_config.actor_rollout_ref.model) + checkpoint_engine_config: CheckpointEngineConfig = omega_conf_to_dataclass( + init_config.actor_rollout_ref.rollout.checkpoint_engine + ) + trainer_pool = RayResourcePool(process_on_nodes=[4], max_colocate_count=3) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + + # 2. create rollout replicas + rollout_config: RolloutConfig = omega_conf_to_dataclass(init_config.actor_rollout_ref.rollout) + + # 2.1 create checkpoint engine worker group + rollout_pool = RayResourcePool(process_on_nodes=[4], max_colocate_count=3) + ray_cls_with_init = RayClassWithInitArgs( + cls=ray.remote(CheckpointEngineWorker), + model_config=model_config, + rollout_config=rollout_config, + ) + rollout = RayWorkerGroup( + resource_pool=rollout_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name() + ) + + # 2.2 create rollout replicas + rollout_replica_class = get_rollout_replica_class(rollout_config.name) + rollout_replicas = [ + rollout_replica_class( + replica_rank=replica_rank, + config=rollout_config, + model_config=model_config, + ) + for replica_rank in range(2) + ] + await asyncio.gather(*[replica.init_hybrid(rollout) for replica in rollout_replicas]) + + # 3. 
create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager( + backend=checkpoint_engine_config.backend, trainer=trainer, replicas=rollout_replicas + ) + for i in range(3): + await checkpoint_manager.update_weights() + + server_addresses = rollout_replicas[i % len(rollout_replicas)].server_address + client = AsyncOpenAI( + api_key="123-abc", + base_url=f"http://{server_addresses}/v1", + ) + + completion = await client.chat.completions.create( + model=init_config.actor_rollout_ref.model.path, + messages=[{"role": "user", "content": "What can you do?"}], + ) + print("[OUTPUT]:", completion.choices[0].message.content) + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..02e3c8f1031df0578fb7459a33d785ff8b2dbdf5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py @@ -0,0 +1,179 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +from typing import Generator + +import ray +import torch +from transformers import AutoModelForCausalLM + +from verl.checkpoint_engine import CheckpointEngineRegistry, CheckpointEngineWorker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name +from verl.utils.fs import copy_to_local +from verl.workers.config import CheckpointEngineConfig, FSDPEngineConfig, HFModelConfig, RolloutConfig +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.rollout import BaseRollout, RolloutReplica + + +class TrainingWorkerTest(TrainingWorker): + def __init__(self, config: TrainingWorkerConfig, checkpoint_engine_config: CheckpointEngineConfig) -> None: + super().__init__(config) + backend = checkpoint_engine_config.backend + bucket_size = checkpoint_engine_config.update_weights_bucket_megabytes << 20 + engine_kwargs = checkpoint_engine_config.engine_kwargs.get(backend, {}) + self.checkpoint_engine = CheckpointEngineRegistry.new( + backend, is_master=(torch.distributed.get_rank() == 0), bucket_size=bucket_size, **engine_kwargs + ) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) + async def update_weights(self): + per_tensor_param, _ = self.engine.get_per_tensor_param() + await self.checkpoint_engine.send_weights(per_tensor_param) + + @register(dispatch_mode=Dispatch.DP_COMPUTE, blocking=False) + def execute_checkpoint_engine(self, method: str, *args, **kwargs): + return getattr(self.checkpoint_engine, method)(*args, **kwargs) + + +class MockServerAdapter(BaseRollout): + def __init__(self, config: RolloutConfig, model_config: HFModelConfig, check_allclose: bool = True): + super().__init__(config, model_config, device_mesh=None) + self.check_allclose = check_allclose + self.model = None + self.received_weights: dict[str, torch.Tensor] = {} + + async def 
resume(self, tags: list[str]): + raise NotImplementedError() + + async def release(self): + raise NotImplementedError() + + async def update_weights( + self, + weights: Generator[tuple[str, torch.Tensor], None, None], + **kwargs, + ): + async for name, weight in weights: + weight = weight.clone() + if self.check_allclose: + self.received_weights[name] = weight.clone() + + def check_weights(self): + if not self.check_allclose: + return + + if self.model is None: + local_path = copy_to_local(self.model_config.path) + self.model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype=torch.bfloat16, device_map="cpu") + + for name, weight in self.model.state_dict().items(): + assert name in self.received_weights, f"weight {name} not received" + received = self.received_weights[name] + assert torch.allclose(weight.to(received.device), received), f"weight {name} not equal" + self.received_weights.clear() + + +class MockReplica(RolloutReplica): + async def init_hybrid(self, worker_group: RayWorkerGroup): + """Init hybrid rollout server, rollout engine and training engine(fsdp/megatron) fused in same process. + + Args: + worker_group: RayWorkerGroup, fused workers where training engine(fsdp/megatron) have been initialized. 
+ """ + self.workers = worker_group.workers[ + self.world_size * self.replica_rank : self.world_size * (self.replica_rank + 1) + ] + + def get_ray_class_with_init_args(self) -> RayClassWithInitArgs: + """Get rollout worker actor class for colocated and standalone mode.""" + raise NotImplementedError + + async def launch_servers(self): + """Launch http server in each node.""" + raise NotImplementedError + + +class CheckpointEngineWorkerTest(CheckpointEngineWorker): + def __init__(self, rollout_config: RolloutConfig, model_config: HFModelConfig, check_allclose: bool = True) -> None: + server_adapter = MockServerAdapter(rollout_config, model_config, check_allclose) + super().__init__(rollout_config, model_config, server_adapter) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def check_weights(self): + self.server_adapter.check_weights() + + +def create_trainer_worker_group( + resource_pool: RayResourcePool, model_config: HFModelConfig, checkpoint_engine_config: CheckpointEngineConfig +) -> RayWorkerGroup: + engine_config = FSDPEngineConfig(forward_only=True, fsdp_size=resource_pool.world_size, strategy="fsdp") + trainer_config = TrainingWorkerConfig( + model_type="language_model", + model_config=model_config, + engine_config=engine_config, + ) + + ray_cls_with_init = RayClassWithInitArgs( + cls=ray.remote(TrainingWorkerTest), + config=trainer_config, + checkpoint_engine_config=checkpoint_engine_config, + ) + ray_cls_with_init.update_options( + { + "runtime_env": { + "env_vars": { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + } + } + } + ) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name()) + return wg + + +async def create_rollout_worker_group( + resource_pool: RayResourcePool, + model_config: HFModelConfig, + rollout_config: RolloutConfig, + check_allclose: bool = True, +) -> tuple[RayWorkerGroup, list[MockReplica]]: + # create rollout worker group + ray_cls_with_init = 
RayClassWithInitArgs( + cls=ray.remote(CheckpointEngineWorkerTest), + model_config=model_config, + rollout_config=rollout_config, + check_allclose=check_allclose, + ) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name()) + + # create rollout replicas + rollout_world_size = ( + rollout_config.tensor_model_parallel_size + * rollout_config.data_parallel_size + * rollout_config.pipeline_model_parallel_size + ) + num_replicas = wg.world_size // rollout_world_size + replicas = [] + for replica_rank in range(num_replicas): + replica = MockReplica( + replica_rank=replica_rank, + config=rollout_config, + model_config=model_config, + ) + replicas.append(replica) + await asyncio.gather(*[replica.init_hybrid(wg) for replica in replicas]) + + return wg, replicas diff --git a/code/RL_model/verl/verl_train/tests/models/test_engine.py b/code/RL_model/verl/verl_train/tests/models/test_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..9878ece4d067da42c14ead4c5af46b992fc561e7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_engine.py @@ -0,0 +1,442 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +os.environ["NCCL_DEBUG"] = "WARN" + +from functools import partial + +import numpy as np +import pytest +import ray +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + AutoTokenizer, + Qwen3Config, + Qwen3MoeConfig, +) + +from verl import DataProto +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.torch_functional import logprobs_from_logits_naive +from verl.workers.config import ( + ActorConfig, + CriticConfig, + FSDPEngineConfig, + FSDPOptimizerConfig, + HFModelConfig, + McoreEngineConfig, + McoreOptimizerConfig, +) +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.utils.losses import ppo_loss, sft_loss, value_loss +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding + + +def get_test_language_model(device_count): + if device_count == 1: + model = "~/models/HuggingFaceTB/SmolLM2-135M-Instruct" + else: + model = "~/models/Qwen/Qwen2.5-0.5B" + model = os.path.expanduser(model) + return model + + +def create_training_config(model_type, strategy, device_count, model): + if device_count == 1: + tp = pp = cp = fsdp_size = 1 + else: + tp = pp = cp = 2 + fsdp_size = 4 + + path = os.path.expanduser(model) + model_config = HFModelConfig(path=path, use_remove_padding=True) + + kwargs = dict( + param_offload=True, + optimizer_offload=True, + grad_offload=True, + use_dynamic_bsz=True, + use_remove_padding=True, + max_token_len_per_gpu=500, + infer_max_token_len_per_gpu=1000, + ) + + if strategy == "megatron": + engine_config = McoreEngineConfig( + forward_only=False, + use_mbridge=True, + 
tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + **kwargs, + ) + optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) + elif strategy in ["fsdp", "fsdp2"]: + engine_config = FSDPEngineConfig( + forward_only=False, fsdp_size=fsdp_size, strategy=strategy, ulysses_sequence_parallel_size=cp, **kwargs + ) + optimizer_config = FSDPOptimizerConfig() + else: + raise NotImplementedError(f"strategy {strategy} is not supported") + + config = TrainingWorkerConfig( + model_type=model_type, + model_config=model_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=None, + ) + return config + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2", "megatron"]) +def test_actor_engine(strategy): + ray.init() + device_count = torch.cuda.device_count() + config = create_training_config( + model_type="language_model", + strategy=strategy, + device_count=device_count, + model=get_test_language_model(device_count), + ) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) + # init model + wg.reset() + + sft_loss_ = partial(sft_loss, config=config) + + wg.set_loss_fn(sft_loss_) + + batch_size = 8 + seqlen = 32 + + response_length = seqlen // 2 + + torch.manual_seed(1) + np.random.seed(1) + + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + global_token_num = torch.sum(attention_mask, dim=-1).tolist() + + print(input_ids.float().mean(), attention_mask.float().mean()) + + responses = input_ids[:, response_length:] + response_mask = 
attention_mask[:, response_length:] + + assert torch.all(response_mask[:, 0] == 1) + + data = DataProto.from_single_dict( + { + "input_ids": input_ids, + "prompts": input_ids[:, :response_length], + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + }, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, + ) + + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # eval + output = wg.infer_batch(data_td) + output = output.get() + logprobs_unpad = tu.get(output, "log_probs").cpu() + logprobs = no_padding_2_padding(logprobs_unpad, data_td) + + output = DataProto.from_single_dict({"old_log_probs": logprobs}) + + # load hf model and compare results with hf model + path = config.model_config.path + hf_model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16) + hf_output = hf_model(input_ids, attention_mask=attention_mask) + hf_logprobs = logprobs_from_logits_naive( + hf_output.logits[:, -response_length - 1 : -1, :].float(), input_ids[:, -response_length:] + ) + hf_logprobs_mean = torch.mean(hf_logprobs * response_mask) + mcore_logprobs_mean = torch.mean(output.batch["old_log_probs"] * response_mask) + + torch.testing.assert_close(hf_logprobs_mean, mcore_logprobs_mean, atol=1e-3, rtol=1e-2) + + data = data.union(output) + + # TODO: sft_loss_ is not compatible with ActorWorker until we replace DataProto with torch.jagged TensorDict + # wg.set_loss_fn(sft_loss_) + + # train for one step + # metrics = wg.update_actor(data) + # print(metrics) + + # add ppo data + data.batch["advantages"] = torch.rand_like(responses, dtype=torch.float32) + data.batch["ref_log_prob"] = torch.rand_like(responses, dtype=torch.float32) + + # construct actor config + actor_config = ActorConfig(strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1) + + # set ppo loss + ppo_loss_ = partial(ppo_loss, config=actor_config) + 
wg.set_loss_fn(ppo_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + + # test manual load/offload + tu.assign_non_tensor(data_td, disable_auto_offload=True) + wg.to("device") + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + wg.to("cpu") + + ray.shutdown() + + +def create_value_model(language_model_path, output_path): + config = AutoConfig.from_pretrained(language_model_path) + config.num_labels = 1 + config.classifier_dropout = 0 + config.tie_word_embeddings = False + model = AutoModelForTokenClassification.from_config(config) + tokenizer = AutoTokenizer.from_pretrained(os.path.expanduser(language_model_path)) + assert model.config.num_labels == 1 + path = os.path.expanduser(output_path) + model.save_pretrained(path) + tokenizer.save_pretrained(path) + config.save_pretrained(path) + return path + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2"]) +def test_critic_engine(strategy): + device_count = torch.cuda.device_count() + value_model_path = os.path.expanduser("~/models/test_model") + language_model_path = get_test_language_model(device_count=device_count) + create_value_model(language_model_path, value_model_path) + + torch.manual_seed(1) + np.random.seed(1) + + ray.init() + + config = create_training_config( + model_type="value_model", strategy=strategy, device_count=device_count, model=value_model_path + ) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) + # init model + wg.reset() 
+ + batch_size = 8 + seqlen = 32 + + response_length = seqlen // 2 + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + global_token_num = torch.sum(attention_mask, dim=-1).tolist() + + print(input_ids.float().mean(), attention_mask.float().mean()) + + responses = input_ids[:, response_length:] + response_mask = attention_mask[:, response_length:] + + assert torch.all(response_mask[:, 0] == 1) + + data = DataProto.from_single_dict( + { + "input_ids": input_ids, + "prompts": input_ids[:, :response_length], + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + }, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, + ) + + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # eval + output = wg.infer_batch(data_td) + output = output.get() + + values_unpad = tu.get(output, "values").float().cpu() + values = no_padding_2_padding(values_unpad, data_td) + + output = DataProto.from_single_dict({"values": values}) + + # load hf model and compare results with hf model + with torch.device("cuda"), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + hf_model = AutoModelForTokenClassification.from_pretrained( + value_model_path, torch_dtype=torch.float32, attn_implementation="flash_attention_2" + ) + hf_output = hf_model(input_ids.cuda(), attention_mask=attention_mask.cuda()) + hf_values = hf_output.logits[:, -response_length - 1 : -1, :].float().squeeze(-1).cpu() + + hf_values_mean = torch.mean(hf_values * response_mask) + engine_values = torch.mean(output.batch["values"] * response_mask) + + torch.testing.assert_close(hf_values_mean, engine_values, atol=1e-2, 
rtol=1e-2) + + data = data.union(output) + + # add ppo data + data.batch["returns"] = torch.rand_like(responses, dtype=torch.float32) + + # update again + # create critic config + critic_config = CriticConfig( + strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model_config=config.model_config + ) + value_loss_ = partial(value_loss, config=critic_config) + wg.set_loss_fn(value_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + + ray.shutdown() + + +def create_actor_model(tmp_path, config): + model = AutoModelForCausalLM.from_config(config) + path = os.path.join(tmp_path, "test_model") + model.save_pretrained(path) + config.save_pretrained(path) + return path + + +def _worker(rank: int, world_size: int, rendezvous_file: str, strategy: str, model_path: str): + torch.cuda.set_device(rank) + dist.init_process_group( + backend="nccl", + init_method=f"file://{rendezvous_file}", + rank=rank, + world_size=world_size, + ) + + ref_model_config = AutoConfig.from_pretrained(model_path) + with torch.device("meta"): + ref_model = AutoModelForCausalLM.from_config(ref_model_config) + + from verl.workers.engine import BaseEngine, EngineRegistry + + # construct configs + model_config = HFModelConfig(path=model_path, load_tokenizer=False) + + if strategy == "megatron": + engine_config = McoreEngineConfig( + forward_only=False, + use_mbridge=True, + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, + context_parallel_size=1, + ) + optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) + elif strategy in ["fsdp", "fsdp2"]: + engine_config = FSDPEngineConfig( + forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 + ) + optimizer_config = 
FSDPOptimizerConfig() + else: + raise NotImplementedError(f"strategy {strategy} is not supported") + + checkpoint_config = CheckpointConfig() + + # build model engine + engine: BaseEngine = EngineRegistry.new( + model_type="language_model", + backend=engine_config.strategy, + model_config=model_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=checkpoint_config, + ) + + engine.initialize() + + # get per tensor parameter + per_tensor_params, _ = engine.get_per_tensor_param() + + ref_state_dict = ref_model.state_dict() + + # load ground truth and compare + for key, value in per_tensor_params: + assert key in ref_state_dict, f"{key} not in ref_state_dict" + assert value.shape == ref_state_dict[key].shape, ( + f"{key} shape not equal, {value.shape} != {ref_state_dict[key].shape}" + ) + if rank == 0: + print(key, value.shape) + + dist.barrier() + dist.destroy_process_group() + + +@pytest.mark.parametrize("world_size", [8]) +@pytest.mark.parametrize("config", [Qwen3Config(num_hidden_layers=2), Qwen3MoeConfig(num_hidden_layers=2)]) +@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) +def test_per_tensor_generator(world_size, tmp_path, config, strategy): + rendezvous_file = str(tmp_path / "rdzv_mask") + os.makedirs(os.path.dirname(rendezvous_file), exist_ok=True) + # create a model + model_path = create_actor_model(tmp_path, config) + # spawn workers + mp.spawn( + fn=_worker, + args=(world_size, rendezvous_file, strategy, model_path), + nprocs=world_size, + join=True, + ) diff --git a/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py b/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..6b022243ffe4ba15724fcf2c89f91a92e0b1e37c --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py @@ -0,0 +1,218 @@ +# Copyright 2025 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test script to verify TiledMLP accuracy by comparing logits and gradients +between regular MLP and TiledMLP under FSDP2. +Run with: torchrun --nproc_per_node=2 tests/test_tiled_mlp_accuracy.py +""" + +import torch +import torch.distributed as dist +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import fully_shard + + +def setup_distributed(): + dist.init_process_group(backend="nccl") + rank = dist.get_rank() + world_size = dist.get_world_size() + torch.cuda.set_device(rank) + return rank, world_size + + +def create_model(model_name="Qwen/Qwen3-1.7B", num_layers=2): + """Load a Qwen3-1.7B model with only 2 layers from pretrained weights.""" + from transformers import AutoConfig, AutoModelForCausalLM + + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + config.num_hidden_layers = num_layers + + model = AutoModelForCausalLM.from_pretrained( + model_name, + config=config, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation="flash_attention_2", + ) + return model + + +def apply_fsdp2(model, device_mesh): + """Apply FSDP2 sharding to model.""" + for layer in model.model.layers: + fully_shard(layer, mesh=device_mesh) + fully_shard(model, mesh=device_mesh) + return model + + +def run_forward_backward(model, input_ids, labels): + """Run forward and backward pass, return logits and gradients.""" + model.zero_grad() 
+ + outputs = model(input_ids=input_ids, labels=labels) + logits = outputs.logits.clone().detach() + loss = outputs.loss + + loss.backward() + + # Collect MLP gradients + gradients = {} + for name, param in model.named_parameters(): + if "mlp" in name and param.grad is not None: + gradients[name] = param.grad.clone().detach() + + return logits, gradients, loss.item() + + +def compare_results(logits1, grads1, logits2, grads2, rank): + """Compare logits and gradients between two runs.""" + # Compare logits + logits_diff = (logits1 - logits2).abs() + logits_max_diff = logits_diff.max().item() + logits_mean_diff = logits_diff.mean().item() + + # Compare gradients (only for params that exist on this rank due to FSDP sharding) + all_pass = True + grad_results = [] + for name in sorted(grads1.keys()): + if name in grads2: + g1, g2 = grads1[name], grads2[name] + diff = (g1 - g2).abs() + max_diff = diff.max().item() + mean_diff = diff.mean().item() + + # Check if within tolerance (1e-2 for bf16) + passed = max_diff < 1e-2 + if not passed: + all_pass = False + grad_results.append((name, max_diff, mean_diff, passed)) + + # Only print on rank 0 to avoid duplicate output + if rank == 0: + print("\n=== Comparison Results ===") + print("\nLogits:") + print(f" Max diff: {logits_max_diff:.2e}") + print(f" Mean diff: {logits_mean_diff:.2e}") + + print("\nMLP Parameter Gradients:") + if grad_results: + for name, max_diff, mean_diff, passed in grad_results: + status = "✓" if passed else "✗" + print(f" {name}: max={max_diff:.2e}, mean={mean_diff:.2e} {status}") + else: + print(" (Gradients sharded to other ranks under FSDP2)") + + return all_pass + + +def main(): + rank, world_size = setup_distributed() + device_mesh = init_device_mesh("cuda", (world_size,)) + + model_name = "Qwen/Qwen3-1.7B" + num_layers = 2 + + if rank == 0: + print(f"Running TiledMLP accuracy test with {world_size} GPUs") + print(f"Model: {model_name} ({num_layers} layers, from pretrained)") + + dist.barrier() + + # 
========== Create Model 1: WITHOUT TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Creating Model 1 (without TiledMLP)") + print("=" * 60) + + model1 = create_model(model_name, num_layers) + model1 = apply_fsdp2(model1, device_mesh) + model1 = model1.cuda() + + # Create deterministic input + torch.manual_seed(42) + batch_size, seq_len = 2, 256 + vocab_size = model1.config.vocab_size + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + labels = input_ids.clone() + + # ========== Run Model 1: WITHOUT TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Running forward/backward on Model 1 (without TiledMLP)") + print("=" * 60) + + logits1, grads1, loss1 = run_forward_backward(model1, input_ids, labels) + if rank == 0: + print(f"Loss: {loss1:.4f}") + + # Free model1 memory before creating model2 + del model1 + torch.cuda.empty_cache() + + dist.barrier() + + # ========== Create Model 2, apply TiledMLP patch, then FSDP2 ========== + if rank == 0: + print("\n" + "=" * 60) + print("Creating Model 2 (with TiledMLP, patch before FSDP2)") + print("=" * 60) + + model2 = create_model(model_name, num_layers) + + # Apply TiledMLP patch AFTER model instantiation but BEFORE FSDP2 wrap + if rank == 0: + print("Applying TiledMLP monkey patch before FSDP2...") + + from verl.models.transformers.tiled_mlp import apply_tiled_mlp_monkey_patch + + apply_tiled_mlp_monkey_patch(num_shards=4, model_type="qwen3") + + model2 = apply_fsdp2(model2, device_mesh) + model2 = model2.cuda() + + dist.barrier() + + # ========== Run Model 2: WITH TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Running forward/backward on Model 2 (with TiledMLP)") + print("=" * 60) + + logits2, grads2, loss2 = run_forward_backward(model2, input_ids, labels) + if rank == 0: + print(f"Loss: {loss2:.4f}") + + dist.barrier() + + # ========== Compare Results ========== + all_pass = compare_results(logits1, grads1, logits2, grads2, rank) 
+ + dist.barrier() + + if rank == 0: + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Loss diff: {abs(loss1 - loss2):.2e}") + print(f"All gradient checks: {'PASS' if all_pass else 'FAIL'}") + + # Cleanup + del model2 + torch.cuda.empty_cache() + + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/models/test_transformer.py b/code/RL_model/verl/verl_train/tests/models/test_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd085497a16cd73e828bff596dd888d054827af --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_transformer.py @@ -0,0 +1,239 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import ( + ApertusConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + GemmaConfig, + LlamaConfig, + MistralConfig, + Qwen2Config, +) + +from verl.utils.device import get_device_name + +if get_device_name() == "cuda": + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input +elif get_device_name() == "npu": + from verl.utils.attention_utils import index_first_axis, pad_input, rearrange, unpad_input + +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.torch_functional import log_probs_from_logits_all_rmpad, masked_mean + +# TODO(sgm): add more models for test +# we only need one scale for each model +test_configs = [ + LlamaConfig(num_hidden_layers=1), + MistralConfig(num_hidden_layers=1), + GemmaConfig(num_hidden_layers=1), + Qwen2Config(num_hidden_layers=1), + ApertusConfig(num_hidden_layers=1), +] + + +def test_hf_casual_models(): + batch_size = 4 + seqlen = 128 + response_length = 127 + + for config in test_configs: + # config = AutoConfig.from_pretrained(test_case) + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, + max_ratio_of_left_padding=0.1, + max_ratio_of_valid_token=0.8, + min_ratio_of_valid_token=0.5, + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) 
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_rmpad = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + origin_logits = model( + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, use_cache=False + ).logits + origin_logits_rmpad, origin_logits_indices, *_ = unpad_input(origin_logits, attention_mask) + + logits_rmpad = logits_rmpad.squeeze(0) + log_probs = log_probs_from_logits_all_rmpad( + input_ids_rmpad=input_ids_rmpad, + logits_rmpad=logits_rmpad, + indices=indices, + batch_size=batch_size, + seqlen=seqlen, + response_length=response_length, + ) # (batch, seqlen) + origin_log_probs = log_probs_from_logits_all_rmpad( + input_ids_rmpad=input_ids_rmpad, + logits_rmpad=origin_logits_rmpad, + indices=origin_logits_indices, + batch_size=batch_size, + seqlen=seqlen, + response_length=response_length, + ) # (batch, seqlen) + + torch.testing.assert_close( + masked_mean(log_probs, attention_mask[:, -response_length - 1 : -1]), + masked_mean(origin_log_probs, attention_mask[:, -response_length - 1 : -1]), + atol=1e-2, + rtol=1e-5, + ) + print("Check pass") + + +def test_hf_value_models(): + batch_size = 4 + seqlen = 128 + + for config in test_configs: + # config = AutoConfig.from_pretrained(test_case) + config.num_labels = 1 + config.classifier_dropout = 0 + config.hidden_dropout = 0 + with torch.device(get_device_name()): + model = AutoModelForTokenClassification.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, 
seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, + max_ratio_of_left_padding=0.1, + max_ratio_of_valid_token=0.8, + min_ratio_of_valid_token=0.5, + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + origin_logits = model( + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, use_cache=False + ).logits + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + rmpad_logits = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, 1) + rmpad_logits = rmpad_logits.squeeze(0) + pad_logits = pad_input(rmpad_logits, indices, batch_size, seqlen=seqlen) + + torch.testing.assert_close( + masked_mean(pad_logits, attention_mask[:, :, None]), + masked_mean(origin_logits, attention_mask[:, :, None]), + atol=1e-2, + rtol=1e-5, + ) + print("Value model check pass") + + +def test_attn_implementation_override(): + """Test that attn_implementation override config is properly respected.""" + # Test case 1: Test the actual extraction logic (no network required) + test_cases = [ + ({}, "flash_attention_2"), # Default case + ({"attn_implementation": "eager"}, "eager"), # Override case + ({"attn_implementation": "sdpa"}, "sdpa"), # Another override + ({"other_config": "value"}, "flash_attention_2"), # No attn_implementation key + ] + + for override_config, expected in test_cases: + actual = override_config.get("attn_implementation", "flash_attention_2") + assert actual == expected, 
f"Expected {expected}, got {actual} for config {override_config}" + + # Test case 2: Test with local config creation (simulate FSDP worker behavior) + # Test default behavior + override_config_default = {} + attn_implementation_default = override_config_default.get("attn_implementation", "flash_attention_2") + assert attn_implementation_default == "flash_attention_2" + + # Test override behavior + override_config_eager = {"attn_implementation": "eager"} + attn_implementation_eager = override_config_eager.get("attn_implementation", "flash_attention_2") + assert attn_implementation_eager == "eager" + + # Test that we can create a config with specific attn_implementation + config_with_eager = LlamaConfig(num_hidden_layers=1, _attn_implementation="eager") + assert config_with_eager._attn_implementation == "eager" + + config_with_flash = LlamaConfig(num_hidden_layers=1, _attn_implementation="flash_attention_2") + assert config_with_flash._attn_implementation == "flash_attention_2" + + print("✓ All attn_implementation override config tests passed") + + +def test_fsdp_worker_attn_implementation_integration(): + """Test integration of attn_implementation with FSDP worker logic.""" + + # Mock the FSDP worker configuration scenario + mock_override_config = {"attn_implementation": "eager"} + + # Test the exact logic used in FSDP workers + attn_implementation = mock_override_config.get("attn_implementation", "flash_attention_2") + assert attn_implementation == "eager" + + # Test with empty config (should default) + mock_override_config_empty = {} + attn_implementation_default = mock_override_config_empty.get("attn_implementation", "flash_attention_2") + assert attn_implementation_default == "flash_attention_2" + + # Test that the parameter would be passed correctly to both AutoConfig and Model + expected_calls = [ + ("AutoConfig.from_pretrained", {"attn_implementation": attn_implementation}), + ("AutoModel.from_pretrained", {"attn_implementation": attn_implementation}), + ] + 
+ # Verify the parameter extraction works as expected + for call_name, expected_params in expected_calls: + assert expected_params["attn_implementation"] == "eager" + + print("✓ FSDP worker integration test passed") + + +if __name__ == "__main__": + test_hf_casual_models() + test_hf_value_models() + test_attn_implementation_override() + test_fsdp_worker_attn_implementation_integration() diff --git a/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py b/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py new file mode 100644 index 0000000000000000000000000000000000000000..b3387927885f00cb928312bd955ab1210a067e6b --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py @@ -0,0 +1,283 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import contextlib +import copy +from dataclasses import dataclass + +import pytest +import torch +import torch.distributed +import transformers +from packaging import version +from torch.distributed import init_device_mesh +from transformers import AutoModelForCausalLM, LlamaConfig, PretrainedConfig, Qwen2Config + +from verl.models.transformers.monkey_patch import apply_monkey_patch +from verl.protocol import DataProto +from verl.utils.device import get_device_name, get_torch_device +from verl.utils.distributed import initialize_global_process_group +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.ulysses import ( + gather_outputs_and_unpad, + get_ulysses_sequence_parallel_world_size, + set_ulysses_sequence_parallel_group, + ulysses_pad_and_slice_inputs, +) +from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager + +if get_device_name() == "cuda": + from flash_attn.bert_padding import index_first_axis, rearrange, unpad_input +elif get_device_name() == "npu": + from verl.utils.attention_utils import index_first_axis, rearrange, unpad_input + +# TODO(sgm): add more models for test +# we only need one scale for each model + + +@dataclass +class SequenceParallelConfig: + config: PretrainedConfig + sp_size: int + is_valid: bool + + +def test_configs(): + configs = [ + SequenceParallelConfig( + LlamaConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32), sp_size=8, is_valid=True + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=28, num_key_value_heads=4, hidden_size=3584), + sp_size=4, + is_valid=True, + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=28, num_key_value_heads=4, hidden_size=3584), + sp_size=8, + is_valid=False, + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4), sp_size=4, is_valid=True + ), + SequenceParallelConfig( + 
Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4), sp_size=8, is_valid=True + ), + ] + + if version.parse(transformers.__version__) >= version.parse("4.56.0"): + from transformers import ApertusConfig + + configs.append( + SequenceParallelConfig( + ApertusConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32, hidden_size=4096), + sp_size=8, + is_valid=True, + ) + ) + + return configs + + +def sync_model_parameters_global(layer): + # synchronize weights + for p in layer.parameters(): + torch.distributed.broadcast(tensor=p.data, src=0) + + +@pytest.mark.parametrize("test_config", test_configs()) +def test_hf_casual_fwd_bwd(test_config): + if not torch.distributed.is_initialized(): + initialize_global_process_group() + + context = contextlib.nullcontext() if test_config.is_valid else pytest.raises(AssertionError) + with context: + world_size = torch.distributed.get_world_size() + _hf_casual_fwd_bwd(test_config.config, test_config.sp_size, world_size // test_config.sp_size) + + # TODO: seems not work, will cause `socketStartConnect: Connect to xxx failed : Software caused connection abort` + # torch.distributed.destroy_process_group() + + +def _hf_casual_fwd(config, sp_size, dp_size): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus for test" + + ulysses_device_mesh = init_device_mesh( + device_type=get_device_name(), mesh_shape=(dp_size, sp_size), mesh_dim_names=("dp", "sp") + ) + sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh) + + batch_size = 1 + seqlen = 128 + # response_length = 127 + + # patch before load + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + apply_monkey_patch(model, sp_size) + model = model.to(device=get_device_name()) + sync_model_parameters_global(model) + + # different rank will generate different input_ids following fsdp + input_ids 
= torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_left_padding=0, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.8 + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + model_inputs = { + "input_ids": input_ids.to(get_device_name()), + "attention_mask": attention_mask.to(get_device_name()), + "position_ids": position_ids.int().to(get_device_name()), + } + + model_inputs = DataProto.from_dict(model_inputs) + + # 1. perform ulysses forward + with sharding_manager: + model_inputs = sharding_manager.preprocess_data(model_inputs) + input_ids = model_inputs.batch["input_ids"] + attention_mask = model_inputs.batch["attention_mask"] + position_ids = model_inputs.batch["position_ids"] + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # slice input tensor for ulysses + # input_ids are padded and sliced + # postition_ids are only padded but not sliced + input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size() + ) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_split_in_seq = model( + input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded, use_cache=False + ).logits # (1, total_nnz/n, vocab_size) + + # all_gather output + logits_full = gather_outputs_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size) + + # 2. 
perform normal forward + set_ulysses_sequence_parallel_group(None) + logits_rmpad_local = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + mean_local = logits_rmpad_local.mean() + mean_full = logits_full.mean() + torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5) + + +def _hf_casual_fwd_bwd(config, sp_size, dp_size): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus for test" + + ulysses_device_mesh = init_device_mesh( + device_type=get_device_name(), mesh_shape=(dp_size, sp_size), mesh_dim_names=("dp", "sp") + ) + sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh) + + batch_size = 1 + seqlen = 128 + # response_length = 127 + + # patch before load + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + apply_monkey_patch(model, sp_size) + model = model.to(device=get_device_name()) + sync_model_parameters_global(model) + + # different rank will generate different input_ids following fsdp + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_left_padding=0, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.8 + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + model_inputs = { + "input_ids": input_ids.to(get_device_name()), + "attention_mask": attention_mask.to(get_device_name()), + "position_ids": position_ids.int().to(get_device_name()), + } + + model_inputs = DataProto.from_dict(model_inputs) + + # 1. 
perform ulysses forward + with sharding_manager: + model_inputs = sharding_manager.preprocess_data(model_inputs) + input_ids = model_inputs.batch["input_ids"] + attention_mask = model_inputs.batch["attention_mask"] + position_ids = model_inputs.batch["position_ids"] + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # slice input tensor for ulysses + # input_ids are padded and sliced + # postition_ids are only padded but not sliced + input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size() + ) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_split_in_seq = model( + input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded, use_cache=False + ).logits # (1, total_nnz/n, vocab_size) + + # all_gather output + logits_full = gather_outputs_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size) + + # 2. perform normal forward + set_ulysses_sequence_parallel_group(None) + input_ids_full = copy.deepcopy(input_ids_rmpad) + position_ids_full = copy.deepcopy(position_ids_rmpad) + model_no_sp = copy.deepcopy(model) + logits_rmpad_local = model_no_sp( + input_ids_full, position_ids=position_ids_full, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + mean_local = logits_rmpad_local.mean() + mean_full = logits_full.mean() + + mean_full.backward() + mean_local.backward() + + # 3. 
check the gradients + grad = model.model.layers[0].self_attn.q_proj.weight.grad + grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad + torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=3e-5) + # The check should be less strict because the gradient is not an averaged value. + torch.testing.assert_close(grad, grad_full, rtol=1e-2, atol=1e-3) + + +if __name__ == "__main__": + pytest.main([__file__, "-svv"]) diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..2b832da89910d1876fdaed7ad88e02170e5c35c1 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py @@ -0,0 +1,47 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest + +from verl.single_controller.base import Worker + + +def test_get_set_dispatch_collect_cpu(): + os.environ["RANK"] = "0" + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = "2" + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12345" + + ref = Worker() + ref._register_dispatch_collect_info(mesh_name="actor", dp_rank=0, is_collect=True) + + actor = Worker() + actor._register_dispatch_collect_info(mesh_name="actor", dp_rank=1, is_collect=False) + + actor_rollout_ref = Worker() + actor_rollout_ref.set_dispatch_collect(mesh_name="ref", **ref.get_dispatch_collect()) + actor_rollout_ref.set_dispatch_collect(mesh_name="actor", **actor.get_dispatch_collect()) + + assert actor_rollout_ref._query_dispatch_info("ref") == 0 + assert actor_rollout_ref._query_collect_info("ref") + assert actor_rollout_ref._query_dispatch_info("actor") == 1 + assert not actor_rollout_ref._query_collect_info("actor") + + # test conflict mesh_name + actor2 = Worker() + actor2._register_dispatch_collect_info(mesh_name="actor", dp_rank=1, is_collect=False) + with pytest.raises(AssertionError): + actor_rollout_ref.set_dispatch_collect(mesh_name="actor", **actor2.get_dispatch_collect()) diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py b/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..99145e5949ee9bf03f85f4201f1e025b42b4e200 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py @@ -0,0 +1,75 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ray + +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name + + +class TestActor(Worker): + # TODO: pass *args and **kwargs is bug prone and not very convincing + def __init__(self, x) -> None: + super().__init__() + self.a = x + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get(self): + return self.a + self.rank + + +class TestHighLevelActor(Worker): + def __init__(self, x=None) -> None: + super().__init__() + self.test_actor = TestActor(x=x) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get(self): + return self.test_actor.get() + + +def test_nested_worker(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=True) + class_with_args = RayClassWithInitArgs(cls=ray.remote(TestActor), x=2) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic", + device_name=get_device_name(), + ) + + output = worker_group.get() + + assert output == [2, 3, 4, 5] + + class_with_args = RayClassWithInitArgs(cls=ray.remote(TestHighLevelActor), x=2) + high_level_worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic_2", + device_name=get_device_name(), + ) + + output_1 = high_level_worker_group.get() + + assert output_1 == 
[2, 3, 4, 5] + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py new file mode 100644 index 0000000000000000000000000000000000000000..3722a8f8029313bad6070d8d0ed2b9a29e4f3770 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py @@ -0,0 +1,113 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test for using ray collective group. +Suppose we Actor and Rollout. Actor contains 4 workers and Rollout contains 2 workers. 
We established a Worker to +Rollout relationship by using collective groups +Actor: rank 0, 1 - Rollout rank 0 +Rollout rank 2, 3 - Rollout rank 1 +Then, we initiate 4 p2p comms from actor to rollout +""" + +import ray +import ray.util.collective as collective +import torch + +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup + + +@ray.remote +class Actor(Worker): + @register(Dispatch.ONE_TO_ALL) + def init(self): + remote_rank = self.rank // 2 + self.group_name = f"A{self.rank}_R{remote_rank}" + collective.init_collective_group(world_size=2, rank=0, backend="nccl", group_name=self.group_name) + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def send_tensors(self): + tensor = torch.ones(size=(4,), dtype=torch.float32, device="cuda") * self.rank + collective.send(tensor=tensor, dst_rank=1, group_name=self.group_name) + + +@ray.remote +class Rollout(Worker): + @register(Dispatch.ONE_TO_ALL) + def init(self): + self.remote_first_rank = self.rank * 2 + self.remote_second_rank = self.remote_first_rank + 1 + self.first_group_name = f"A{self.remote_first_rank}_R{self.rank}" + self.second_group_name = f"A{self.remote_second_rank}_R{self.rank}" + + collective.init_collective_group(world_size=2, rank=1, backend="nccl", group_name=self.first_group_name) + collective.init_collective_group(world_size=2, rank=1, backend="nccl", group_name=self.second_group_name) + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def receive_tensors(self): + self.tensor1 = torch.randn(size=(4,), dtype=torch.float32, device="cuda") + self.tensor2 = torch.randn(size=(4,), dtype=torch.float32, device="cuda") + + collective.recv(self.tensor1, src_rank=0, group_name=self.first_group_name) + collective.recv(self.tensor2, src_rank=0, group_name=self.second_group_name) + + @register(Dispatch.ONE_TO_ALL) + def get_tensors(self): + return 
{f"src_{self.remote_first_rank}": self.tensor1, f"src_{self.remote_second_rank}": self.tensor2} + + +def test_ray_collective_group(): + ray.init() + + actor_resource_pool = RayResourcePool([4]) + rollout_resource_pool = RayResourcePool([2]) + + actor_cls = RayClassWithInitArgs(cls=Actor) + rollout_cls = RayClassWithInitArgs(cls=Rollout) + + actor_wg = RayWorkerGroup( + resource_pool=actor_resource_pool, ray_cls_with_init=actor_cls, name_prefix="collective_group_actor" + ) + rollout_wg = RayWorkerGroup( + resource_pool=rollout_resource_pool, ray_cls_with_init=rollout_cls, name_prefix="collective_group_rollout" + ) + + actor_wg.init() + rollout_wg.init() + + out1 = actor_wg.send_tensors() + out2 = rollout_wg.receive_tensors() + + # block to wait + ray.get(out1) + ray.get(out2) + + output = rollout_wg.get_tensors() + + rollout_0_output = output[0] + rollout_1_output = output[1] + + output = rollout_0_output | rollout_1_output + + print(output) + + for i in range(4): + assert torch.sum(output[f"src_{i}"]).item() == 4 * i + + ray.shutdown() + + +if __name__ == "__main__": + test_ray_collective_group() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..6c51beeaf3f8600387ce14fe63c97a5c804c4237 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py @@ -0,0 +1,91 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +e2e test verl.single_controller.ray +""" + +import os + +import ray + +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup + + +@ray.remote +class TestActor(Worker): + def __init__(self) -> None: + super().__init__() + + def getenv(self, key): + val = os.getenv(key, f"{key} not set") + return val + + +def test_basics(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=False) + class_with_args = RayClassWithInitArgs(cls=TestActor) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, ray_cls_with_init=class_with_args, name_prefix="worker_group_basic" + ) + + output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_WORLD_SIZE") + assert output == ["4", "4", "4", "4"] + + ray.shutdown() + + +def test_customized_worker_env(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=False) + class_with_args = RayClassWithInitArgs(cls=TestActor) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_customized", + worker_env={ + "test_key": "test_value", # new key will be appended + }, + ) + + output = worker_group.execute_all_sync("getenv", key="test_key") + assert output == ["test_value", "test_value", "test_value", "test_value"] + + try: + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + 
name_prefix="worker_group_error", + worker_env={ + "WORLD_SIZE": "100", # override system env will result in error + }, + ) + except ValueError as e: + assert "WORLD_SIZE" in str(e) + else: + raise ValueError("test failed") + + ray.shutdown() + + +if __name__ == "__main__": + test_basics() + test_customized_worker_env() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..e36497d210f6ec5daa8b9d559987f5dcc3974af2 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py @@ -0,0 +1,54 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import ray + +from verl.utils.ray_utils import parallel_put + + +# Initialize Ray for testing if not already done globally +@pytest.fixture() +def init_ray(): + ray.init(num_cpus=4) + yield + ray.shutdown() + + +def test_parallel_put_basic(init_ray): + data = [1, "hello", {"a": 2}, [3, 4]] + refs = parallel_put(data) + assert len(refs) == len(data) + retrieved_data = [ray.get(ref) for ref in refs] + assert retrieved_data == data + + +def test_parallel_put_empty(init_ray): + data = [] + with pytest.raises(AssertionError): + _ = parallel_put(data) + + +def test_parallel_put_workers(init_ray): + data = list(range(20)) + # Test with specific number of workers + refs = parallel_put(data, max_workers=4) + assert len(refs) == len(data) + retrieved_data = [ray.get(ref) for ref in refs] + assert retrieved_data == data + # Test with default workers (should cap) + refs_default = parallel_put(data) + assert len(refs_default) == len(data) + retrieved_data_default = [ray.get(ref) for ref in refs_default] + assert retrieved_data_default == data diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py b/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb32606cf36e83bf41fb59154ce72c51928b804 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py @@ -0,0 +1,181 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import ray +import torch + +from verl import DataProto +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray.base import ( + RayClassWithInitArgs, + RayResourcePool, + RayWorkerGroup, + split_resource_pool, +) +from verl.utils.device import get_device_name, get_nccl_backend + + +@ray.remote +class Actor(Worker): + def __init__(self, worker_id) -> None: + super().__init__() + self.worker_id = worker_id + self.temp_tensor = torch.rand(4096, 4096).to(get_device_name()) + + if not torch.distributed.is_initialized(): + rank = int(os.environ.get("RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + torch.distributed.init_process_group(backend=get_nccl_backend(), world_size=world_size, rank=rank) + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def add(self, data: DataProto): + data.batch["a"] += self.rank + self.worker_id + return data + + +def test_split_resource_pool_with_split_size(): + ray.init() + # assume we have 2 nodes, with 4 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[4, 4]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 4 gpus for actor_1, last 4 gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool(resource_pool=global_resource_pool, split_size=4) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + assert actor_worker_1.world_size == 4 + assert actor_worker_2.world_size == 4 + + data = 
DataProto.from_dict({"a": torch.zeros(8)}) + actor_output_1 = actor_worker_1.add(data) + actor_output_2 = actor_worker_2.add(data) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1, 2, 2, 3, 3] + assert actor_output_2.batch["a"].tolist() == [100, 100, 101, 101, 102, 102, 103, 103] + + ray.shutdown() + + +def test_split_resource_pool_with_split_size_list(): + ray.init() + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[2, 2, 2, 2]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 2 gpus for actor_1, last 6 gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 6], + ) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + assert actor_worker_1.world_size == 2 + assert actor_worker_2.world_size == 6 + + data_1 = DataProto.from_dict({"a": torch.zeros(4)}) + data_2 = DataProto.from_dict({"a": torch.zeros(6)}) + actor_output_1 = actor_worker_1.add(data_1) + actor_output_2 = actor_worker_2.add(data_2) + print(actor_output_1.batch["a"].tolist()) + print(actor_output_2.batch["a"].tolist()) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1] + assert actor_output_2.batch["a"].tolist() == [100, 101, 102, 103, 104, 105] + + ray.shutdown() + + +def test_split_resource_pool_with_split_size_list_cross_nodes(): + ray.init() + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[4, 4]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 2 gpus for actor_1, last 6 
gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 6], + ) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + + assert actor_worker_1.world_size == 2 + assert actor_worker_2.world_size == 6 + + data_1 = DataProto.from_dict({"a": torch.zeros(4)}) + data_2 = DataProto.from_dict({"a": torch.zeros(6)}) + actor_output_1 = actor_worker_1.add(data_1) + actor_output_2 = actor_worker_2.add(data_2) + print(actor_output_1.batch["a"].tolist()) + print(actor_output_2.batch["a"].tolist()) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1] + assert actor_output_2.batch["a"].tolist() == [100, 101, 102, 103, 104, 105] + + ray.shutdown() + + +def test_split_resource_pool_with_split_twice(): + ray.init() + + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[2, 2, 2, 2]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # actors with [2, 1, 1, 1, 1, 2] (split twice) + rp_1, rp_2, rp_3 = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 4, 2], + ) + rp_2_1, rp_2_2, rp_2_3, rp_2_4 = split_resource_pool( + resource_pool=rp_2, + split_size=1, + ) + fp_list = [rp_1, rp_2_1, rp_2_2, rp_2_3, rp_2_4, rp_3] + correct_world_size = [2, 1, 1, 1, 1, 2] + correct_output = [ + [0.0, 0.0, 1.0, 1.0], # 2 worker + [100.0, 100.0, 100.0, 100.0], # 1 worker + [200.0, 200.0, 200.0, 200.0], # 1 worker + [300.0, 300.0, 300.0, 300.0], # 1 worker + [400.0, 400.0, 400.0, 400.0], # 1 worker + [500.0, 500.0, 501.0, 501.0], # 2 worker + ] + for idx, rp in 
enumerate(fp_list): + actor_cls = RayClassWithInitArgs(cls=Actor, worker_id=idx * 100) + actor_worker = RayWorkerGroup(resource_pool=rp, ray_cls_with_init=actor_cls, device_name=get_device_name()) + data = DataProto.from_dict({"a": torch.zeros(4)}) + actor_output = actor_worker.add(data) + assert actor_worker.world_size == correct_world_size[idx] + assert actor_output.batch["a"].tolist() == correct_output[idx] + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py b/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py new file mode 100644 index 0000000000000000000000000000000000000000..13075d7b8ec4b3ec684894ac705c2cb887412fce --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py @@ -0,0 +1,147 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +e2e test verl.single_controller.ray +""" + +import ray +import torch + +from verl.single_controller.base.decorator import Dispatch, Execute, collect_all_to_all, register +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name + + +def two_to_all_dispatch_fn(worker_group, *args, **kwargs): + """ + Assume the input is a list of 2. Duplicate the input interleaved and pass to each worker. 
+ """ + for arg in args: + assert len(arg) == 2 + for i in range(worker_group.world_size - 2): + arg.append(arg[i % 2]) + for k, v in kwargs.items(): + assert len(v) == 2 + for i in range(worker_group.world_size - 2): + v.append(v[i % 2]) + return args, kwargs + + +def get_ray_remote_options() -> str: + """Function that gets the torch.device based on the current machine. + This currently only supports CPU, CUDA, NPU. + Returns: + device + """ + if get_device_name() == "cuda": + return dict(num_gpus=0.1) + elif get_device_name() == "npu": + return dict(resources={"NPU": 0.1}) + return dict(num_cpus=0.1) + + +@ray.remote +class TestActor(Worker): + # TODO: pass *args and **kwargs is bug prone and not very convincing + def __init__(self, x) -> None: + super().__init__() + self._x = x + + def foo(self, y): + return self._x + y + + @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO) + def foo_rank_zero(self, x, y): + return self._x + y + x + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def foo_one_to_all(self, x, y): + return self._x + y + x + + @register(Dispatch.ALL_TO_ALL, blocking=False) + def foo_all_to_all(self, x, y): + return self._x + y + x + + @register(dispatch_mode={"dispatch_fn": two_to_all_dispatch_fn, "collect_fn": collect_all_to_all}) + def foo_custom(self, x, y): + return self._x + y + x + + +@ray.remote(**get_ray_remote_options()) +def remote_call_wg(worker_names): + class_with_args = RayClassWithInitArgs(cls=TestActor, x=2) + worker_group = RayWorkerGroup.from_detached( + worker_names=worker_names, ray_cls_with_init=class_with_args, name_prefix=None + ) + print(worker_group.worker_names) + + output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6]) + assert output_ref == [8, 10, 8, 10] + + output_ref = worker_group.foo_rank_zero(x=1, y=2) + assert output_ref == 5 + + return worker_group.worker_names + + +def add_one(data): + data = data.to(get_device_name()) + data += 1 + data = data.to("cpu") + return data + + +def 
test_basics(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=True) + class_with_args = RayClassWithInitArgs(cls=TestActor, x=2) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic", + device_name=get_device_name(), + ) + + print(worker_group.worker_names) + + # this will wait for all the results + output = worker_group.execute_all_sync("foo", y=3) + assert output == [5, 5, 5, 5] + + # this is a list of object reference. It won't block. + output_ref = worker_group.execute_all_async("foo", y=4) + print(output_ref) + + assert ray.get(output_ref) == [6, 6, 6, 6] + + output_ref = worker_group.foo_one_to_all(x=1, y=2) + assert ray.get(output_ref) == [5, 5, 5, 5] + + output_ref = worker_group.foo_all_to_all(x=[1, 2, 3, 4], y=[5, 6, 7, 8]) + assert ray.get(output_ref) == [8, 10, 12, 14] + + print(ray.get(remote_call_wg.remote(worker_group.worker_names))) + + output = worker_group.execute_func_rank_zero(add_one, torch.ones(2, 2)) + torch.testing.assert_close(output, torch.ones(2, 2) + 1) + + ray.shutdown() + + +if __name__ == "__main__": + test_basics() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/README.md b/code/RL_model/verl/verl_train/tests/special_distributed/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f2f865e8bf95a673a0d6f56b74c7a2c12535faf2 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/README.md @@ -0,0 +1 @@ +This folder is reserved for unit tests (instead of end-to-end tests) that require multiple GPUs. 
diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh b/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d6c5c71e54a1d6000025840b1abc783f56b60d5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh @@ -0,0 +1,19 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash + +set -e -x +torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_tensor_dict.py +torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_torch_functional.py diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..4c9b497c47cb9359efb6c9c598391ffb0493cb40 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py @@ -0,0 +1,165 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import shutil +import tempfile + +import torch +import torch.distributed +from torch.distributed import init_device_mesh +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import MixedPrecision, ShardingStrategy +from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2Config + +from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager +from verl.utils.device import get_device_name, get_torch_device +from verl.utils.distributed import initialize_global_process_group +from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2 + + +def create_random_input_ids(batch_size, seq_len, vocab_size): + if get_device_name() == "cuda": + from flash_attn.bert_padding import unpad_input + elif get_device_name() == "npu": + from verl.utils.attention_utils import unpad_input + from verl.utils.model import compute_position_id_with_mask, create_random_mask + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=get_device_name()) + + attention_mask = create_random_mask( + input_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + input_ids = unpad_input(input_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + position_ids = unpad_input(position_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + return input_ids, position_ids + + +def test_fsdp_ckpt(strategy="fsdp"): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus 
for test" + local_rank, rank, world_size = initialize_global_process_group() + device_mesh = init_device_mesh(get_device_name(), mesh_shape=(world_size,), mesh_dim_names=("dp",)) + + model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") + config = Qwen2Config(num_hidden_layers=1) + + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + + # Wrap model with FSDP + if strategy == "fsdp": + mixed_precision = MixedPrecision( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32 + ) + + model = FSDP( + model, + use_orig_params=False, + device_id=get_torch_device().current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + device_mesh=device_mesh, + ) + else: + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32, cast_forward_inputs=True + ) + fsdp_kwargs = { + "mesh": device_mesh, + "mp_policy": mp_policy, + } + apply_fsdp2(model, fsdp_kwargs, {}) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9) + + # Create checkpoint manager + tokenizer = AutoTokenizer.from_pretrained(model_name) + checkpoint_manager = FSDPCheckpointManager( + model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, tokenizer=tokenizer + ) + + # Generate sample input + batch_size = 10 + seq_len = 1024 + vocab_size = config.vocab_size + # First input for initial update + input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size) + + # Second input for verification + input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size) + + # Step 1: Initial update and save checkpoint + outputs1 = model(input_ids=input_ids1, position_ids=position_ids1) + loss1 = 
outputs1.logits.mean() + loss1.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Save checkpoint after first update + temp_dir = tempfile.mkdtemp() + checkpoint_path = os.path.join(temp_dir, "checkpoint") + checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0) + saved_state_dict = model.state_dict() + + # Step 2: Second update and forward pass + outputs2 = model(input_ids=input_ids2, position_ids=position_ids2) + loss2 = outputs2.logits.mean() + loss2.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Record logits after second update + with torch.no_grad(): + logits_before_load = model(input_ids=input_ids2, position_ids=position_ids2).logits + + # Step 3: Load checkpoint and repeat second update + checkpoint_manager.load_checkpoint(checkpoint_path) + loaded_state_dict = model.state_dict() + for key in loaded_state_dict: + assert key in saved_state_dict, f"Key {key} not found in saved state dict" + torch.testing.assert_close(loaded_state_dict[key], saved_state_dict[key], atol=0.0, rtol=0.0) + + # Repeat the second update with same input + outputs3 = model(input_ids=input_ids2, position_ids=position_ids2) + loss3 = outputs3.logits.mean() + loss3.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Record logits after loaded checkpoint and update + with torch.no_grad(): + logits_after_load = model(input_ids=input_ids2, position_ids=position_ids2).logits + + # Step 4: Verify outputs match + torch.testing.assert_close(logits_before_load, logits_after_load, atol=0.0, rtol=0.0) + print("Checkpoint save/load test passed!") + + # Cleanup + shutil.rmtree(temp_dir) + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + strategy = os.environ.get("STRATEGY", "fsdp") + os.environ["FLASH_ATTENTION_DETERMINISTIC"] = "1" + test_fsdp_ckpt(strategy=strategy) diff --git 
a/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..d8f24c49911ed7b1fb1d73740dfc150e57dade0d --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py @@ -0,0 +1,100 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import megatron.core.parallel_state as mpu +import torch +from megatron.core.transformer import MLATransformerConfig, TransformerConfig +from transformers import AutoConfig, PretrainedConfig + +from verl.models.mcore import hf_to_mcore_config +from verl.utils.distributed import destroy_global_process_group, initialize_global_process_group + +TEST_MODELS = [ + "Qwen/Qwen2.5-7B", # Qwen2 dense + "Qwen/Qwen3-8B", # Qwen3 dense + "deepseek-ai/deepseek-coder-1.3b-instruct", # deepseek dense + "Qwen/Qwen2-57B-A14B", # Qwen2 moe + "Qwen/Qwen3-30B-A3B", # Qwen3 moe + # "mistralai/Mixtral-8x7B-v0.1", # Mixtral # require authentication + "deepseek-ai/DeepSeek-V3-Base", # Deepseek V3 +] + + +def check_config_converter_results(tf_config: TransformerConfig | MLATransformerConfig, hf_config: PretrainedConfig): + assert tf_config.num_layers == hf_config.num_hidden_layers, ( + f"Number of layers mismatch: {tf_config.num_layers} != {hf_config.num_hidden_layers}" + ) + assert tf_config.hidden_size == hf_config.hidden_size, ( + f"Hidden size mismatch: {tf_config.hidden_size} != {hf_config.hidden_size}" + ) + assert tf_config.num_attention_heads == hf_config.num_attention_heads, ( + f"Number of attention heads mismatch: {tf_config.num_attention_heads} != {hf_config.num_attention_heads}" + ) + assert tf_config.num_query_groups == hf_config.num_key_value_heads, ( + f"Number of query groups mismatch: {tf_config.num_query_groups} != {hf_config.num_key_value_heads}" + ) + assert tf_config.ffn_hidden_size == hf_config.intermediate_size, ( + f"FFN hidden size mismatch: {tf_config.ffn_hidden_size} != {hf_config.intermediate_size}" + ) + assert tf_config.attention_dropout == hf_config.attention_dropout, ( + f"Attention dropout mismatch: {tf_config.attention_dropout} != {hf_config.attention_dropout}" + ) + assert tf_config.hidden_dropout == getattr(hf_config, "hidden_dropout", 0.0), ( + f"Hidden dropout mismatch: {tf_config.hidden_dropout} != {getattr(hf_config, 
'hidden_dropout', 0.0)}" + ) + if getattr(hf_config, "head_dim", None) is not None: + assert tf_config.kv_channels == getattr(hf_config, "head_dim", None), ( + f"Head dim mismatch: {tf_config.kv_channels} != {getattr(hf_config, 'head_dim', None)}" + ) + assert tf_config.layernorm_epsilon == hf_config.rms_norm_eps, ( + f"Layernorm epsilon mismatch: {tf_config.layernorm_epsilon} != {hf_config.rms_norm_eps}" + ) + + +def modify_hf_config(name: str, hf_config: PretrainedConfig): + if name == "deepseek-ai/DeepSeek-V3-Base": + hf_config.num_nextn_predict_layers = 0 + hf_config.quantization_config = None + return hf_config + + +def test_mcore_config_converter(): + """ + Test the conversion of Hugging Face model configurations to MCore configurations. + """ + local_rank, rank, world_size = initialize_global_process_group() + mpu.initialize_model_parallel( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=None, + use_sharp=False, + context_parallel_size=2, + expert_model_parallel_size=1, + expert_tensor_parallel_size=None, + nccl_communicator_config_path=None, + ) + for model_name in TEST_MODELS: + print(f"testing {model_name}") + hf_config = AutoConfig.from_pretrained(os.path.expanduser(f"~/models/configs/{model_name}/config.json")) + hf_config = modify_hf_config(model_name, hf_config) + tf_config = hf_to_mcore_config(hf_config, torch.bfloat16) + check_config_converter_results(tf_config, hf_config) + + destroy_global_process_group() + + +if __name__ == "__main__": + test_mcore_config_converter() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..565f8a8120845cddb8e166eb9f08f181dc2b6cff --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py @@ -0,0 +1,126 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["NCCL_DEBUG"] = "WARN" + +import numpy as np +import torch +import torch.distributed + +from verl.protocol import DataProto, all_gather_data_proto +from verl.utils.device import get_device_name +from verl.utils.distributed import initialize_global_process_group + + +def test_all_gather_data_proto(): + device_mesh = torch.distributed.device_mesh.init_device_mesh( + get_device_name(), mesh_shape=[2, 2], mesh_dim_names=["dp", "tp"] + ) + + global_rank = torch.distributed.get_rank() + + obs = torch.tensor([[1 * global_rank, 2 * global_rank + 1], [3 * global_rank, 4 * global_rank + 1]]) + + labels = ["a", "b"] if global_rank % 2 == 0 else ["b", "a"] + labels = np.array(labels, dtype=object) + data = DataProto.from_dict(tensors={"obs": obs}, non_tensors={"labels": labels}, meta_info={"info": "test_info"}) + + all_gather_data_proto(data=data, process_group=device_mesh.get_group("dp")) + + if global_rank == 0: + expected_obs = torch.tensor([[0, 1], [0, 1], [2, 5], [6, 9]], device=get_device_name()) + expected_labels = ["a", "b", "a", "b"] + elif global_rank == 1: + expected_obs = torch.tensor([[1, 3], [3, 5], [3, 7], [9, 13]], device=get_device_name()) + expected_labels = ["b", "a", "b", "a"] + elif global_rank == 2: + expected_obs = torch.tensor([[0, 1], [0, 1], [2, 5], [6, 9]], device=get_device_name()) + expected_labels = ["a", "b", "a", "b"] + elif global_rank == 3: + 
expected_obs = torch.tensor([[1, 3], [3, 5], [3, 7], [9, 13]], device=get_device_name()) + expected_labels = ["b", "a", "b", "a"] + + torch.testing.assert_close(data.batch["obs"], expected_obs, atol=0, rtol=0) + assert (data.non_tensor_batch["labels"] == expected_labels).all() + assert data.meta_info == {"info": "test_info"} + + +def test_vocab_parallel_entropy(): + from megatron.core import parallel_state as mpu + + from verl.utils.megatron.tensor_parallel import vocab_parallel_entropy + from verl.utils.profiler import log_gpu_memory_usage + from verl.utils.torch_functional import entropy_from_logits + + mpu.initialize_model_parallel( + tensor_model_parallel_size=2, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None + ) + + batch_size = 2 + seqlen = 128 + vocab_size = 155136 + + logits = torch.randn(batch_size * seqlen, vocab_size, device=get_device_name(), requires_grad=True) + target = torch.randint( + low=0, high=vocab_size, size=(batch_size * seqlen,), device=get_device_name(), dtype=torch.int64 + ) + + # broadcast across tp + torch.distributed.broadcast( + logits, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group() + ) + torch.distributed.broadcast( + target, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group() + ) + + tp_rank = mpu.get_tensor_model_parallel_rank() + vocab_size_per_tp = vocab_size // mpu.get_tensor_model_parallel_world_size() + + # get the local logits of each tp + vocab_parallel_logits = ( + logits.clone().detach()[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp].requires_grad_() + ) + logits.grad = None + vocab_parallel_logits.grad = None + + log_gpu_memory_usage("begin") + output_entropy = vocab_parallel_entropy(vocab_parallel_logits) + log_gpu_memory_usage("after forward") + grad_output = torch.randn_like(output_entropy) + output_entropy.backward(grad_output) + log_gpu_memory_usage("after backward") + + target_entropy = 
entropy_from_logits(logits) + torch.testing.assert_close(output_entropy, target_entropy) + target_entropy.backward(grad_output) + torch.testing.assert_close( + logits.grad[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp], vocab_parallel_logits.grad + ) + # make sure logits is not altered + torch.testing.assert_close( + logits[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp], vocab_parallel_logits + ) + + if mpu.get_tensor_model_parallel_rank() == 0: + print("test_vocab_parallel_entropy passes") + + mpu.destroy_model_parallel() + + +if __name__ == "__main__": + local_rank, rank, world_size = initialize_global_process_group() + test_all_gather_data_proto() + test_vocab_parallel_entropy() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..d07d335f5a313e6557e72e2331c88176486fc016 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py @@ -0,0 +1,35 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import torch + +from verl.utils.torch_functional import allgather_dict_into_dict + +if __name__ == "__main__": + torch.distributed.init_process_group(backend="gloo") + + local_rank = int(os.environ["LOCAL_RANK"]) + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + metrics_dict = {"loss": [0 + rank, 1 + rank, 2 + rank], "grad_norm": rank} + + result = allgather_dict_into_dict(data=metrics_dict, group=None) + + assert result["loss"] == [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]] + assert result["grad_norm"] == [0, 1, 2, 3] + + print(result) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..03756f2d284ddcb58b41e068b4abd560b2d074f7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py @@ -0,0 +1,142 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fail CI if any function or class that is publicly exported via +``__all__`` lacks a docstring. 
+ +Usage +----- + # Check specific modules or packages + python check_docstrings.py mypkg.core mypkg.utils + + # Check an entire source tree (all top-level packages under cwd) + python check_docstrings.py +""" + +from __future__ import annotations + +import argparse +import importlib +import inspect +import pkgutil +import sys +from pathlib import Path +from types import ModuleType +from typing import Iterable + +_ALLOW_LIST = [ + "verl.third_party.vllm.LLM", + "verl.third_party.vllm.parallel_state", + "verl.utils.profiler.WorkerProfiler", + "verl.utils.profiler.WorkerProfilerExtension", + "verl.utils.profiler.log_gpu_memory_usage", + "verl.utils.profiler.log_print", + "verl.utils.profiler.mark_annotate", + "verl.utils.profiler.mark_end_range", + "verl.utils.profiler.mark_start_range", + "verl.models.mcore.qwen2_5_vl.get_vision_model_config", + "verl.models.mcore.qwen2_5_vl.get_vision_projection_config", + "verl.models.mcore.mbridge.freeze_moe_router", + "verl.models.mcore.mbridge.make_value_model", + "verl.utils.transformers_compat.flash_attn_supports_top_left_mask", +] + + +def iter_submodules(root: ModuleType) -> Iterable[ModuleType]: + """Yield *root* and every sub-module inside it.""" + yield root + + def print_pkg_error(pkg_name): + print(f"[warn] Skipping {pkg_name!r}", file=sys.stderr) + + if getattr(root, "__path__", None): # only packages have __path__ + for mod_info in pkgutil.walk_packages(root.__path__, prefix=f"{root.__name__}.", onerror=print_pkg_error): + try: + yield importlib.import_module(mod_info.name) + except Exception as exc: + print(f"[warn] Skipping {mod_info.name!r}: {exc}", file=sys.stderr) + + +def names_missing_doc(mod: ModuleType) -> list[str]: + """Return fully-qualified names that need docstrings.""" + missing: list[str] = [] + public = getattr(mod, "__all__", []) + for name in public: + obj = getattr(mod, name, None) + if f"{mod.__name__}.{name}" in _ALLOW_LIST: + continue + if obj is None: + # Exported but not found in the module: 
flag it anyway. + missing.append(f"{mod.__name__}.{name} (not found)") + continue + + if inspect.isfunction(obj) or inspect.isclass(obj): + doc = inspect.getdoc(obj) + if not doc or not doc.strip(): + missing.append(f"{mod.__name__}.{name}") + return missing + + +def check_module(qualname: str) -> list[str]: + """Import *qualname* and check it (and sub-modules).""" + try: + module = importlib.import_module(qualname) + except ModuleNotFoundError as exc: + print(f"[error] Cannot import '{qualname}': {exc}", file=sys.stderr) + return [qualname] + + missing: list[str] = [] + for submod in iter_submodules(module): + missing.extend(names_missing_doc(submod)) + return missing + + +def autodiscover_packages() -> list[str]: + """Detect top-level packages under CWD when no argument is given.""" + pkgs: list[str] = [] + for p in Path.cwd().iterdir(): + if p.is_dir() and (p / "__init__.py").exists(): + pkgs.append(p.name) + return pkgs + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "modules", + nargs="*", + help="Fully-qualified module or package names (defaults to every top-level package found in CWD).", + ) + args = parser.parse_args() + + targets = args.modules or autodiscover_packages() + if not targets: + raise ValueError("[error] No modules specified and none detected automatically.") + + all_missing: list[str] = [] + for modname in targets: + all_missing.extend(check_module(modname)) + + if all_missing: + print("\nMissing docstrings:") + for name in sorted(all_missing): + print(f" - {name}") + raise ValueError("Missing docstrings detected. 
Please enhance them with docs accordingly.") + + print("✅ All exported functions/classes have docstrings.") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..7c8521ab12e0fc2f39dd965d3aefbb4f303c12c9 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py @@ -0,0 +1,69 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This CI test is used for checking whether DataProto is used in the code of some directory +""" + +import os +from argparse import ArgumentParser +from pathlib import Path + +SEARCH_WHITELIST = [] + +SEARCH_KEYWORDS = ["DataProto"] + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--directory", "-d", required=True, type=str) + args = parser.parse_args() + directory_in_str = args.directory + + pathlist = Path(directory_in_str).glob("**/*.py") + for path in pathlist: + path_in_str = str(path.absolute()) + + # judge whether current path is in pre-defined search whitelist or not. 
+ path_in_whitelist = False + + for sw in SEARCH_WHITELIST: + # for easy debugging in non-linux system + sw = sw.replace("/", os.sep) + if sw in path_in_str: + print(f"[SKIP] File {path_in_str} is in device api usage check whitelist, checking is skipped.") + path_in_whitelist = True + break + + if path_in_whitelist: + continue + + with open(path_in_str, encoding="utf-8") as f: + file_content = f.read() + + find_invalid_device_management = False + + for sk in SEARCH_KEYWORDS: + if sk in file_content: + find_invalid_device_management = True + break + + print( + f"[CHECK] File {path_in_str} is detected for DataProto usage check, check result: " + f"{'success' if not find_invalid_device_management else f'failed, because detect {sk}'}." + ) + + assert not find_invalid_device_management, ( + f"file {path_in_str} contains DataProto usage, please use TensorDict directly!" + ) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf9cf7e75a0cff068d87e1d369d8f7600306db1 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py @@ -0,0 +1,107 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This CI test is used for checking whether device api usage is irregular, suggest using api in `verl/utils/device.py`. +Search targets include .py files in verl/recipe and verl/verl. +Some files that must contain ".cuda", "cuda" or "nccl" keyword is pre-defined in whitelist below. +""" + +import os +from argparse import ArgumentParser +from pathlib import Path + +# directory or file path must contain keyword ".cuda" or "cuda" +CUDA_KEYWORD_CHECK_WHITELIST = [ + "verl/utils/device.py", + "verl/utils/torch_functional.py", # import flash_attn only on cuda + "verl/utils/profiler/nvtx_profile.py", # appear in NsightSystemsProfiler + "verl/utils/profiler/torch_profile.py", # appear in TorchProfiler + "verl/utils/profiler/config.py", # appear in TorchProfilerToolConfig + "verl/utils/kernel/linear_cross_entropy.py", # appear in nvidia nvtx + "verl/utils/rendezvous/ray_backend.py", # appear in cupy importance + "verl/single_controller/ray/base.py", # appear in default device_name + "verl/trainer/ppo/ray_trainer.py", # appear in default device_name + "verl/experimental/transfer_queue/ray_trainer.py", # appear in docstring as default device_name + "verl/experimental/one_step_off_policy/ray_trainer.py", # appear in docstring as default device_name + "verl/utils/reward_score/sandbox_fusion/utils.py", # appear in sandbox language type + "verl/workers/reward_model/megatron/reward_model.py", # appear in default device_name + "verl/third_party/torch/distributed/_state_dict_utils.py", # torch monkey patch fixes + "verl/third_party/torch/distributed/checkpoint/state_dict.py", # torch monkey patch fixes + "verl/workers/engine/base.py", # appear in default device_name + "verl/workers/engine/utils.py", # appear in enable_full_determinism + "verl/workers/engine/fsdp/transformer_impl.py", # appear in default device_name + "verl/workers/engine/veomni/transformer_impl.py", # appear in default device_name + "verl/workers/rollout/vllm_rollout/vllm_async_server.py", # appear in 
config.cudagraph_capture_sizes + "verl/workers/rollout/sglang_rollout/async_sglang_server.py", # manually set CUDA_VISIBLE_DEVICES + "verl/workers/rollout/trtllm_rollout/trtllm_async_server.py", # appear in config.cudagraph_capture_sizes + "verl/workers/rollout/replica.py", # appear in default device_name + "verl/checkpoint_engine", # checkpoint engine backend are device specific +] + +# directory or file path must contain keyword "nccl" +NCCL_KEYWORD_CHECK_WHITELIST = [ + "verl/utils/device.py", + "verl/third_party/sglang/parallel_state.py", # appear in default backend + "verl/recipe/fully_async_policy/param_sync.py", # fully_async_policy in default backend +] + +SEARCH_WHITELIST = CUDA_KEYWORD_CHECK_WHITELIST + NCCL_KEYWORD_CHECK_WHITELIST + +SEARCH_KEYWORDS = [".cuda", '"cuda"', '"nccl"'] + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--directory", "-d", required=True, type=str) + args = parser.parse_args() + directory_in_str = args.directory + + pathlist = Path(directory_in_str).glob("**/*.py") + for path in pathlist: + path_in_str = str(path.absolute()) + + # judge whether current path is in pre-defined search whitelist or not. + path_in_whitelist = False + + for sw in SEARCH_WHITELIST: + # for easy debugging in non-linux system + sw = sw.replace("/", os.sep) + if sw in path_in_str: + print(f"[SKIP] File {path_in_str} is in device api usage check whitelist, checking is skipped.") + path_in_whitelist = True + break + + if path_in_whitelist: + continue + + with open(path_in_str, encoding="utf-8") as f: + file_content = f.read() + + find_invalid_device_management = False + + for sk in SEARCH_KEYWORDS: + if sk in file_content: + find_invalid_device_management = True + break + + print( + f"[CHECK] File {path_in_str} is detected for device api usage check, check result: " + f"{'success' if not find_invalid_device_management else f'failed, because detect {sk}'}." 
+ ) + + assert not find_invalid_device_management, ( + f'file {path_in_str} contains .cuda/"cuda"/"nccl" usage, please use api in ' + f"verl/utils/device.py directly." + ) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py new file mode 100644 index 0000000000000000000000000000000000000000..a54d1d50a7e9d21202387e2c9c8e3c6c73a5d807 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py @@ -0,0 +1,84 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Check that every .md and .rst file under docs/ contains the substring "Last updated", +with an allow-list for exceptions. +""" + +import sys +from pathlib import Path + +# === CONFIGURATION === + +# Relative paths (to docs/) or glob patterns to skip checking +ALLOW_LIST = { + "docs/README.md", # you can list individual files + "docs/legacy/*.rst", # or glob patterns + "docs/index.rst", + "docs/start/install.rst", + "docs/start/quickstart.rst", + "docs/README_vllm0.7.md", +} + +# The folder to scan +DOCS_DIR = Path("docs") + +# === SCRIPT === + + +def is_allowed(path: Path) -> bool: + """ + Return True if `path` matches any entry in ALLOW_LIST. 
+ """ + rel = str(path) + for pattern in ALLOW_LIST: + if Path(rel).match(pattern): + return True + return False + + +def main(): + if not DOCS_DIR.exists(): + print(f"Error: Documentation directory '{DOCS_DIR}' does not exist.", file=sys.stderr) + sys.exit(1) + + missing = [] + + # Gather all .md and .rst files under docs/ + for ext in ("*.md", "*.rst"): + for path in DOCS_DIR.rglob(ext): + if is_allowed(path): + continue + + text = path.read_text(encoding="utf-8", errors="ignore") + if "Last updated" not in text: + missing.append(path) + + # Report + if missing: + print("\nThe following files are missing the 'Last updated' string:\n") + for p in missing: + print(f" - {p}") + print(f"\nTotal missing: {len(missing)}\n", file=sys.stderr) + raise AssertionError( + "Some documentation files lack a 'Last updated' line. Please include info such as " + "'Last updated: mm/dd/yyyy' to indicate the last update time of the document." + ) + else: + print("✅ All checked files contain 'Last updated'.") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py new file mode 100644 index 0000000000000000000000000000000000000000..222ebef4997588257ebdf2e6ad88964ebcba78fc --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py @@ -0,0 +1,156 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Python script to check docstrings for functions and classes in specified files. +Checks that every public function and class has proper docstring documentation. +""" + +import ast +import os +import sys + + +class DocstringChecker(ast.NodeVisitor): + """AST visitor to check for missing docstrings in functions and classes.""" + + def __init__(self, filename: str): + self.filename = filename + self.missing_docstrings: list[tuple[str, str, int]] = [] + self.current_class = None + self.function_nesting_level = 0 + + def visit_FunctionDef(self, node: ast.FunctionDef): + """Visit function definitions and check for docstrings.""" + if not node.name.startswith("_") and self.function_nesting_level == 0: + if not self._has_docstring(node): + func_name = f"{self.current_class}.{node.name}" if self.current_class else node.name + self.missing_docstrings.append((func_name, self.filename, node.lineno)) + + self.function_nesting_level += 1 + self.generic_visit(node) + self.function_nesting_level -= 1 + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): + """Visit async function definitions and check for docstrings.""" + if not node.name.startswith("_") and self.function_nesting_level == 0: + if not self._has_docstring(node): + func_name = f"{self.current_class}.{node.name}" if self.current_class else node.name + self.missing_docstrings.append((func_name, self.filename, node.lineno)) + + self.function_nesting_level += 1 + self.generic_visit(node) + self.function_nesting_level -= 1 + + def visit_ClassDef(self, node: ast.ClassDef): + """Visit class definitions and check for docstrings.""" + if not node.name.startswith("_"): + if not self._has_docstring(node): + self.missing_docstrings.append((node.name, self.filename, node.lineno)) + + old_class = self.current_class + self.current_class = node.name + self.generic_visit(node) + self.current_class = old_class + 
+ def _has_docstring(self, node) -> bool: + """Check if a node has a docstring.""" + return ast.get_docstring(node) is not None + + +def check_file_docstrings(filepath: str) -> list[tuple[str, str, int]]: + """Check docstrings in a single file.""" + try: + with open(filepath, encoding="utf-8") as f: + content = f.read() + + tree = ast.parse(content, filename=filepath) + checker = DocstringChecker(filepath) + checker.visit(tree) + return checker.missing_docstrings + + except Exception as e: + print(f"Error processing {filepath}: {e}") + return [] + + +def main(): + """Main function to check docstrings in specified files.""" + + files_to_check = [ + "verl/trainer/ppo/ray_trainer.py", + "verl/trainer/main_ppo.py", + "verl/trainer/ppo/reward.py", + "verl/utils/reward_score/__init__.py", + "verl/trainer/ppo/core_algos.py", + "verl/experimental/agent_loop/agent_loop.py", + "verl/workers/sharding_manager/fsdp_vllm.py", + "verl/workers/sharding_manager/fsdp_ulysses.py", + ] + + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_path = os.path.dirname(os.path.dirname(script_dir)) + + if not os.path.exists(repo_path): + print(f"Repository path {repo_path} does not exist!") + sys.exit(1) + + os.chdir(repo_path) + + all_missing_docstrings = [] + + print("Checking docstrings in specified files...") + print("=" * 60) + + for file_path in files_to_check: + if not os.path.exists(file_path): + print(f"Warning: File {file_path} does not exist!") + continue + + print(f"Checking {file_path}...") + missing = check_file_docstrings(file_path) + all_missing_docstrings.extend(missing) + + if missing: + print(f" Found {len(missing)} missing docstrings") + else: + print(" All functions and classes have docstrings [OK]") + + print("=" * 60) + + if all_missing_docstrings: + print(f"\nSUMMARY: Found {len(all_missing_docstrings)} functions/classes missing docstrings:") + print("-" * 60) + + by_file = {} + for name, filepath, lineno in all_missing_docstrings: + if filepath not in 
by_file: + by_file[filepath] = [] + by_file[filepath].append((name, lineno)) + + for filepath in sorted(by_file.keys()): + print(f"\n{filepath}:") + for name, lineno in sorted(by_file[filepath], key=lambda x: x[1]): + print(f" - {name} (line {lineno})") + + print(f"\nTotal missing docstrings: {len(all_missing_docstrings)}") + + raise Exception(f"Found {len(all_missing_docstrings)} functions/classes without proper docstrings!") + + else: + print("\n[OK] All functions and classes have proper docstrings!") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py new file mode 100644 index 0000000000000000000000000000000000000000..7cfa256b5f913841af65ac99975c52fe20ca3103 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py @@ -0,0 +1,88 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from argparse import ArgumentParser +from pathlib import Path +from typing import Iterable + +license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates" +license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates" +license_head_bytedance_26 = "Copyright 2026 Bytedance Ltd. 
def get_py_files(path_arg: Path) -> Iterable[Path]:
    """Collect Python files under ``path_arg``.

    A directory is searched recursively; a ``.py`` file is returned as a
    one-element list; anything else yields an empty list.

    Args:
        path_arg (Path): directory or file to inspect

    Returns:
        Iterable[Path]: the Python files found
    """
    if path_arg.is_file():
        return [path_arg] if path_arg.suffix == ".py" else []
    if path_arg.is_dir():
        return path_arg.glob("**/*.py")
    return []
# Number of lines of the PR template that are compared against the PR body.
NUM_LINES = 5


# Custom exception types for clear error handling
class TemplateFileError(Exception):
    """Raised when the PR template file cannot be read."""


class PRBodyLoadError(Exception):
    """Raised when the PR body cannot be loaded from the event payload."""


class PRDescriptionError(Exception):
    """Raised when the PR description still matches the template placeholder."""


# Path to the PR template file
template_file = os.path.join(os.getenv("GITHUB_WORKSPACE", "."), ".github", "PULL_REQUEST_TEMPLATE.md")


def load_template(path):
    """
    Load the first NUM_LINES of the PR template file as a list of lines.

    Each line is whitespace-stripped so the later comparison is insensitive
    to trailing newlines and indentation.  (The previous docstring claimed
    no characters were stripped, which contradicted the implementation.)

    Args:
        path: path to the PR template file.

    Returns:
        list[str]: up to NUM_LINES stripped lines.

    Raises:
        TemplateFileError: if the file cannot be read.
    """
    lines = []
    try:
        with open(path, encoding="utf-8") as f:
            for _ in range(NUM_LINES):
                line = f.readline()
                if not line:  # EOF before NUM_LINES lines were read
                    break
                lines.append(line.strip())
        return lines
    except Exception as e:
        raise TemplateFileError(f"Failed to read PR template (first {NUM_LINES} lines) at {path}: {e}") from e
def main():
    """Entry point: compare the PR body against the template and report."""
    event_path = os.getenv("GITHUB_EVENT_PATH")
    if not event_path:
        raise OSError("GITHUB_EVENT_PATH is not set.")

    # Load both sides of the comparison, then validate the PR description.
    placeholder_lines = load_template(template_file)
    body = load_pr_body(event_path)
    check_pr_description(body, placeholder_lines)

    print("✅ '### What does this PR do?' section has been filled out.")
import os
import re

# PR title comes from the CI workflow environment.
pr_title = os.environ.get("PR_TITLE", "").strip()

# Define rules
allowed_modules = [
    "fsdp", "megatron", "veomni", "sglang", "vllm", "trtllm", "rollout", "trainer",
    "tests", "training_utils", "recipe", "hardware", "deployment",
    "ray", "worker", "single_controller", "misc", "docker", "ci",
    "perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg", "reward",
]
allowed_types = ["feat", "fix", "refactor", "chore", "test"]

# An optional [1/N] progress prefix is stripped before further checks.
serial_prefix = re.match(r"^\[\d/[\dNn]\]\s*(.+)$", pr_title, re.IGNORECASE)
if serial_prefix:
    pr_title = serial_prefix.group(1).strip()

# An optional [BREAKING] prefix marks a breaking change; the remainder is validated.
breaking_prefix = re.match(r"^\[BREAKING\]\s*(.+)$", pr_title, re.IGNORECASE)
is_breaking = breaking_prefix is not None
core_pr_title = breaking_prefix.group(1).strip() if is_breaking else pr_title

# The title must start with a bracketed, comma-separated module list.
module_section = re.match(r"^\[([a-z_,\s]+)\]", core_pr_title, re.IGNORECASE)
if not module_section:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [BREAKING][module] type: description")
    print(f"Allowed modules: {', '.join(allowed_modules)}")
    raise Exception("Invalid PR title")

modules = re.findall(r"[a-z_]+", module_section.group(1).lower())
invalid_modules = [module for module in modules if module not in allowed_modules]
if invalid_modules:
    print(f"❌ Invalid modules: {', '.join(invalid_modules)}")
    print(f"Allowed modules: {', '.join(allowed_modules)}")
    raise Exception("Invalid PR title")

# After the module list there must be an allowed change type and a description.
types_pattern = "|".join(re.escape(t) for t in allowed_types)
type_match = re.match(rf"^\[[a-z_,\s]+\]\s+({types_pattern}):\s+.+$", core_pr_title, re.IGNORECASE)
if not type_match:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [BREAKING][module] type: description")
    print(f"Allowed types: {', '.join(allowed_types)}")
    raise Exception("Invalid PR title")

change_type = type_match.group(1).lower()

# Build the success message
breaking_info = " (BREAKING CHANGE)" if is_breaking else ""
print(f"✅ PR title is valid: {pr_title}, modules: {modules}, type: {change_type}{breaking_info}")
def validate_yaml_format(yaml_lines):
    """Check that every YAML key line has a comment directly above it, carries
    no inline comment, and is followed by a blank line (or end of file).

    Args:
        yaml_lines: raw lines of the YAML file (newlines included).

    Returns:
        list[str]: human-readable error messages; empty when compliant.
    """
    key_re = re.compile(r"^(\s*)([a-zA-Z0-9_]+):")
    errors = []

    for idx, raw in enumerate(yaml_lines):
        text = raw.strip()

        # Blank lines never trigger checks.
        if not text:
            continue

        # Only lines that introduce a YAML key ("field:" or "field: value") are checked.
        if key_re.match(raw) is None:
            continue

        # Rule 1: a '#' comment must sit directly above every key.
        has_comment_above = idx > 0 and yaml_lines[idx - 1].strip().startswith("#")
        if not has_comment_above:
            errors.append(f"Missing comment above line {idx + 1}: {raw.strip()}")

        # Rule 2: no inline comments after the key's colon.
        if "#" in raw and not text.startswith("#"):
            if raw.index("#") > raw.index(":"):
                errors.append(f"Inline comment found on line {idx + 1}: {raw.strip()}")

        # Rule 3: a key line must be followed by a blank line unless it is the last line.
        if idx + 1 < len(yaml_lines) and yaml_lines[idx + 1].strip() != "":
            errors.append(f"Missing blank line after line {idx + 1}: {raw.strip()}")

    return errors
def test_import():
    """Smoke test: importing the top-level ``verl`` package succeeds and it exposes ``__version__``."""
    import verl

    print(verl.__version__)


def test_single_controller_import():
    """Smoke test: importing ``verl.single_controller`` succeeds and it exposes ``__version__``."""
    import verl.single_controller

    print(verl.single_controller.__version__)
# Result codes returned by has_type_annotations.
CHECK_SUCCESS = 0
CHECK_WARNING = 1
CHECK_FAILURE = -1


def should_check_type(arg_name: str) -> bool:
    """Return True when an argument name is subject to annotation checking.

    ``self``/``cls`` and star-arguments (``*args``/``**kwargs``) are exempt.
    """
    return arg_name not in ("self", "cls") and not arg_name.startswith("*")


def has_type_annotations(node: ast.AST, debug: bool = False) -> int:
    """Grade a single AST node for type-annotation completeness.

    Only plain ``ast.FunctionDef`` nodes are graded; every other node passes.
    A function passes when it is private, accepts ``*args``/``**kwargs``, or
    has every checkable argument plus the return value annotated.

    Args:
        node: the AST node to inspect.
        debug: when True, print the unannotated arguments of failing nodes.

    Returns:
        int: CHECK_SUCCESS or CHECK_FAILURE.
    """
    if not isinstance(node, ast.FunctionDef):
        return CHECK_SUCCESS

    # Functions taking *args/**kwargs are exempt from the check.
    if node.args.vararg is not None or node.args.kwarg is not None:
        return CHECK_SUCCESS

    # Private helpers are exempt as well.
    if node.name.startswith("_"):
        return CHECK_SUCCESS

    checkable = [arg for arg in node.args.args if should_check_type(arg.arg)]
    fully_annotated = node.returns is not None and all(arg.annotation is not None for arg in checkable)
    if fully_annotated:
        return CHECK_SUCCESS

    if debug:
        print(node, [(arg.annotation, arg.arg) for arg in checkable])
    return CHECK_FAILURE
git", + ) + parser.add_argument("--debug", action="store_true", help="Add debugging logs") + args = parser.parse_args() + + total_changed = 0 + total_annotated = 0 + all_warnings: list[tuple[Path, int, str]] = [] + all_failures: list[tuple[Path, int, str]] = [] + + target_files = [args.target_file] if args.target_file is not None else get_changed_files() + for fpath in target_files: + if "tests/" in str(fpath): + continue + if args.all_lines: + changed_lines = [i + 1 for i in range(len(open(fpath).readlines()))] + else: + changed_lines = get_changed_lines(fpath) + annotated, total, warning_lines, failure_lines = check_file(fpath, changed_lines, args.debug) + total_annotated += annotated + total_changed += total + all_warnings.extend(warning_lines) + all_failures.extend(failure_lines) + + ratio = (total_annotated / total_changed) if total_changed else 1.0 + + print( + f"🔍 Type coverage on {'all' if args.all_lines else 'changed'} lines: " + f"{total_annotated}/{total_changed} = {ratio:.2%}. Files inspected: {target_files}" + ) + + if all_warnings: + print("\n⚠️ Suggest Improve: Lines missing type annotations for inputs and outputs:\n") + for fname, lineno, line in all_warnings: + print(f"{fname}:{lineno}: {line}") + + if all_failures: + print("⚠️ [ERROR] Lines missing type annotations for inputs and outputs:\n") + for fname, lineno, line in all_failures: + print(f"{fname}:{lineno}: {line}") + + if ratio < args.threshold: + print( + f"Please add type annotations for inputs and outputs to meet threshold {args.threshold}. " + f"Cases exempt from checking:" + ) + print("1. Private methods.") + print("2. Args with name in ('self', 'cls'), or *args / **kwargs") + print("3. 
Files under tests/") + raise Exception(f"\n❌ Type coverage below threshold ({args.threshold:.0%}).") + else: + if all_warnings or all_failures: + print("") + print("✅ Type annotation coverage acceptable.\n") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py b/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..b36a407be77a777cd72a4abf8ce4727d375eb548 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py @@ -0,0 +1,130 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +verify_imported_docs.py + +Assert that every function or class *explicitly imported* (via +`from import `) in a given Python file has a docstring. +""" + +from __future__ import annotations + +import argparse +import ast +import importlib +import inspect +import pathlib +import sys + + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Verify that imported functions/classes have docstrings.") + p.add_argument( + "--target-file", + default="verl/trainer/ppo/ray_trainer.py", + help="Path to the Python source file to analyse (e.g. 
verl/trainer/ppo/ray_trainer.py)", + ) + p.add_argument( + "--allow-list", + default=["omegaconf.open_dict"], + help="a list of third_party dependencies that do not have proper docs :(", + ) + p.add_argument( + "--project-root", + default=".", + help="Directory to prepend to PYTHONPATH so local packages resolve (default: .)", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress success message (still prints errors).", + ) + return p.parse_args() + + +def _import_attr(module_name: str, attr_name: str): + """Import `module_name` then return `getattr(module, attr_name)`.""" + module = importlib.import_module(module_name) + return getattr(module, attr_name) + + +def _check_file(py_file: pathlib.Path, project_root: pathlib.Path, allow_list: list[str]) -> list[str]: + """Return a list of error strings (empty == success).""" + # Ensure local packages resolve + sys.path.insert(0, str(project_root.resolve())) + + tree = ast.parse(py_file.read_text(), filename=str(py_file)) + problems: list[str] = [] + + for node in ast.walk(tree): + if not isinstance(node, ast.ImportFrom): + continue + + # Relative imports (level > 0) get the leading dots stripped + module_name = "." * node.level + (node.module or "") + for alias in node.names: + if alias.name == "*": + problems.append( + f"{py_file}:{node.lineno} - wildcard import `from {module_name} import *` cannot be verified." 
def main() -> None:
    """CLI entry point: parse arguments, run the docstring check, and report."""
    cli = _parse_args()
    target = pathlib.Path(cli.target_file).resolve()
    root = pathlib.Path(cli.project_root).resolve()

    if not target.is_file():
        raise Exception(f"❌ Target file not found: {target}")

    problems = _check_file(target, root, cli.allow_list)
    if problems:
        print("Docstring verification failed:\n")
        print("\n".join(f" • {e}" for e in problems))
        raise Exception("❌ Docstring verification failed.")

    if not cli.quiet:
        print(f"✅ All explicitly imported functions/classes in {target} have docstrings.")
def find_violations(tests_root: Path, allowed: set[str], allowed_files: list[str]) -> list[str]:
    """Return error messages for test files that sit outside an allowed subfolder.

    Args:
        tests_root: root directory of the test tree.
        allowed: first-level folder names tests may live under.
        allowed_files: exact paths (as strings) exempt from the layout rule.

    Returns:
        list[str]: one message per misplaced test file (empty == compliant).
    """
    problems: list[str] = []
    for candidate in tests_root.rglob("test*.py"):
        # Explicitly whitelisted files are exempt.
        if str(candidate) in allowed_files:
            continue

        parts = candidate.relative_to(tests_root).parts
        if len(parts) < 2:
            # The file sits directly at the tests root.
            problems.append(f"{candidate}: must be inside one of {sorted(allowed)} (not at tests root)")
        elif parts[0] not in allowed:
            problems.append(
                f"{candidate}: subfolder '{parts[0]}' under tests/ is not an allowed module. "
                f"The valid ones are: {sorted(allowed)}"
            )
    return problems
0000000000000000000000000000000000000000..6f79d474d156e16ae54bb3d0c8f9ae7d0e16946e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/trainer/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests for the trainer module. +""" diff --git a/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py b/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..a039fa6e43aff7a19c9a88de00f74239d183fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py @@ -0,0 +1,304 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class MockTransformerModel(nn.Module):
    """Lightweight stand-in for a causal LM, used to exercise DataParallelPPOActor.

    Loosely mirrors the HF interface: ``forward`` accepts ``input_ids`` plus
    ignored ``attention_mask``/``position_ids``/``use_cache`` kwargs and returns
    an object exposing a ``.logits`` tensor of shape (batch, seq, vocab).
    """

    def __init__(self, vocab_size=1000, hidden_size=64):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # Submodule registration order is kept stable (embedding, transformer, lm_head).
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=4, batch_first=True), num_layers=2
        )
        self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, attention_mask=None, position_ids=None, use_cache=False, **kwargs):
        # The unpack doubles as a cheap rank check on input_ids (must be 2-D).
        batch_size, seq_len = input_ids.shape

        hidden = self.transformer(self.embedding(input_ids))
        token_logits = self.lm_head(hidden)

        class MockOutput:
            # Minimal stand-in for a transformers ModelOutput: only .logits is needed.
            def __init__(self, logits):
                self.logits = logits

        return MockOutput(token_logits)
torch.npu.set_device(cls.rank) + cls.device = torch.device(f"npu:{cls.rank}") + else: + cls.device = torch.device("cpu") + + def setUp(self): + """Set up test fixtures""" + self.config = FSDPActorConfig( + strategy="fsdp2", + ppo_mini_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + ppo_epochs=1, + clip_ratio=0.2, + entropy_coeff=0.01, + grad_clip=1.0, + use_dynamic_bsz=False, + use_torch_compile=False, # Disable torch.compile for testing + ulysses_sequence_parallel_size=1, + optim=OptimizerConfig(lr=1e-6), + rollout_n=1, + ) + + self.mock_model = MockTransformerModel(vocab_size=1000, hidden_size=64).to(self.device) + self.mock_optimizer = torch.optim.Adam(self.mock_model.parameters(), lr=1e-4) + + self.actor = DataParallelPPOActor( + config=self.config, actor_module=self.mock_model, actor_optimizer=self.mock_optimizer + ) + + @classmethod + def tearDownClass(cls): + """Clean up distributed environment""" + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + def _create_test_data_for_compute_log_prob(self): + """Create test DataProto for compute_log_prob method""" + batch_size = 2 + prompt_length = 8 + response_length = 4 + total_length = prompt_length + response_length + vocab_size = 1000 + + input_ids = torch.randint(0, vocab_size, (batch_size, total_length)).to(self.device) + attention_mask = torch.ones(batch_size, total_length).to(self.device) + position_ids = torch.arange(total_length).unsqueeze(0).expand(batch_size, -1).to(self.device) + responses = input_ids[:, -response_length:] # Last part is the response + + tensor_dict = TensorDict( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + }, + batch_size=[batch_size], + ) + + meta_info = {"micro_batch_size": batch_size, "temperature": 1.0, "use_dynamic_bsz": False} + + return DataProto(batch=tensor_dict, meta_info=meta_info) + + def _create_test_data_for_update_policy(self): + """Create test DataProto 
for update_policy method""" + batch_size = 4 # Must match ppo_mini_batch_size + prompt_length = 8 + response_length = 4 + total_length = prompt_length + response_length + vocab_size = 1000 + + input_ids = torch.randint(0, vocab_size, (batch_size, total_length)).to(self.device) + attention_mask = torch.ones(batch_size, total_length).to(self.device) + position_ids = torch.arange(total_length).unsqueeze(0).expand(batch_size, -1).to(self.device) + responses = input_ids[:, -response_length:] + response_mask = torch.ones(batch_size, response_length).to(self.device) + old_log_probs = torch.randn(batch_size, response_length).to(self.device) * 0.1 # Small values + advantages = torch.randn(batch_size, response_length).to(self.device) * 0.5 + + tensor_dict = TensorDict( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + "old_log_probs": old_log_probs, + "advantages": advantages, + }, + batch_size=[batch_size], + ) + + meta_info = {"temperature": 1.0} + + return DataProto(batch=tensor_dict, meta_info=meta_info) + + def test_compute_log_prob(self): + """Test compute_log_prob method""" + data = self._create_test_data_for_compute_log_prob() + + outputs = self.actor.compute_log_prob(data, calculate_entropy=True) + log_probs = outputs["log_probs"] + entropys = outputs["entropys"] + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + + self.assertIsInstance(entropys, torch.Tensor) + self.assertEqual(entropys.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(entropys))) + self.assertTrue(torch.all(entropys >= 0)) # Entropy should be non-negative + + def test_compute_log_prob_without_entropy(self): + """Test compute_log_prob 
method without entropy calculation""" + data = self._create_test_data_for_compute_log_prob() + + outputs = self.actor.compute_log_prob(data, calculate_entropy=False) + log_probs = outputs["log_probs"] + entropys = outputs.get("entropys", None) + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + self.assertIsNone(entropys) + + def test_update_policy(self): + """Test update_policy method""" + data = self._create_test_data_for_update_policy() + + metrics = self.actor.update_policy(data) + + self.assertIsInstance(metrics, dict) + + expected_metric_keys = [ + "actor/pg_loss", + "actor/pg_clipfrac", + "actor/ppo_kl", + "actor/pg_clipfrac_lower", + "actor/grad_norm", + ] + + for key in expected_metric_keys: + self.assertIn(key, metrics) + if isinstance(metrics[key], list): + self.assertTrue(all(torch.isfinite(torch.tensor(v)) for v in metrics[key])) + else: + self.assertIsInstance(metrics[key], (float, int)) + self.assertTrue(torch.isfinite(torch.tensor(metrics[key]))) + + def test_dataparallelppoactor_initialization(self): + """Test DataParallelPPOActor initialization""" + self.assertIsNotNone(self.actor.actor_module) + self.assertIsNotNone(self.actor.actor_optimizer) + self.assertEqual(self.actor.config, self.config) + + self.assertEqual(self.actor.config.strategy, "fsdp2") + self.assertEqual(self.actor.config.ppo_mini_batch_size, 4) + self.assertEqual(self.actor.config.clip_ratio, 0.2) + + def test_dataparallelppoactor_with_qwen3_model(self): + """Test DataParallelPPOActor with real Qwen3ForCausalLM model""" + qwen_config = Qwen3Config( + vocab_size=1000, + hidden_size=64, + intermediate_size=128, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + max_position_embeddings=512, + torch_dtype=torch.float32, + use_cache=False, 
+ ) + + with torch.device(self.device): + qwen_model = AutoModelForCausalLM.from_config(config=qwen_config, torch_dtype=torch.float32).to(self.device) + + qwen_optimizer = torch.optim.Adam(qwen_model.parameters(), lr=1e-4) + + qwen_actor = DataParallelPPOActor(config=self.config, actor_module=qwen_model, actor_optimizer=qwen_optimizer) + + data = self._create_test_data_for_compute_log_prob() + outputs = qwen_actor.compute_log_prob(data, calculate_entropy=True) + log_probs = outputs["log_probs"] + entropys = outputs["entropys"] + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + + self.assertIsInstance(entropys, torch.Tensor) + self.assertEqual(entropys.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(entropys))) + self.assertTrue(torch.all(entropys >= 0)) + + policy_data = self._create_test_data_for_update_policy() + metrics = qwen_actor.update_policy(policy_data) + + self.assertIsInstance(metrics, dict) + + expected_metric_keys = [ + "actor/pg_loss", + "actor/pg_clipfrac", + "actor/ppo_kl", + "actor/pg_clipfrac_lower", + "actor/grad_norm", + ] + + for key in expected_metric_keys: + self.assertIn(key, metrics) + if isinstance(metrics[key], list): + self.assertTrue(all(torch.isfinite(torch.tensor(v)) for v in metrics[key])) + else: + self.assertIsInstance(metrics[key], (float, int)) + self.assertTrue(torch.isfinite(torch.tensor(metrics[key]))) + + +if __name__ == "__main__": + unittest.main() diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..464746b56ccb710f487590c992ddcea70c998663 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py @@ -0,0 +1,256 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from verl.utils.config import omega_conf_to_dataclass +from verl.workers.config import ( + ActorConfig, + FSDPActorConfig, + McoreActorConfig, + OptimizerConfig, +) + + +class TestActorConfig(unittest.TestCase): + """Test the ActorConfig dataclass and its variants.""" + + def test_config_inheritance(self): + """Test that the inheritance hierarchy works correctly.""" + megatron_dict = { + "_target_": "verl.workers.config.McoreActorConfig", + "strategy": "megatron", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "clip_ratio": 0.2, + "optim": { + "_target_": "verl.workers.config.McoreOptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + fsdp_dict = { + "_target_": "verl.workers.config.FSDPActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "clip_ratio": 0.2, + "optim": { + "_target_": "verl.workers.config.FSDPOptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + + megatron_config = omega_conf_to_dataclass(megatron_dict) + fsdp_config = omega_conf_to_dataclass(fsdp_dict) + + self.assertIsInstance(megatron_config, ActorConfig) + self.assertIsInstance(fsdp_config, ActorConfig) + + self.assertEqual(megatron_config.ppo_mini_batch_size, 
fsdp_config.ppo_mini_batch_size) + self.assertEqual(megatron_config.clip_ratio, fsdp_config.clip_ratio) + + def test_actor_config_from_yaml(self): + """Test creating ActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="actor", overrides=["strategy=fsdp", "ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, ActorConfig) + self.assertEqual(config.strategy, "fsdp") + + def test_fsdp_actor_config_from_yaml(self): + """Test creating FSDPActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="dp_actor", overrides=["strategy=fsdp2", "ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, FSDPActorConfig) + self.assertEqual(config.strategy, "fsdp2") + + def test_megatron_actor_config_from_yaml(self): + """Test creating McoreActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="megatron_actor", overrides=["ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, McoreActorConfig) + self.assertEqual(config.strategy, "megatron") + + def test_config_get_method(self): + """Test the get method for backward compatibility.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + self.assertEqual(config.get("strategy"), "fsdp") 
+ self.assertEqual(config.get("ppo_mini_batch_size"), 256) + + self.assertIsNone(config.get("non_existing")) + self.assertEqual(config.get("non_existing", "default"), "default") + + def test_config_dict_like_access(self): + """Test dictionary-like access to config fields.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + self.assertEqual(config["strategy"], "fsdp") + self.assertEqual(config["ppo_mini_batch_size"], 256) + + field_names = list(config) + self.assertIn("strategy", field_names) + self.assertIn("ppo_mini_batch_size", field_names) + + self.assertGreater(len(config), 0) + + def test_frozen_fields_modification_raises_exception(self): + """Test that modifying frozen fields raises an exception.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + with self.assertRaises(AttributeError): + config.strategy = "megatron" + + with self.assertRaises(AttributeError): + config.clip_ratio = 0.5 + + config.ppo_mini_batch_size = 512 # This should work since it's not in frozen fields anymore + self.assertEqual(config.ppo_mini_batch_size, 512) + + def test_actor_config_validation_exceptions(self): + """Test that ActorConfig.__post_init__ raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + loss_agg_mode="invalid-mode", + use_dynamic_bsz=True, + optim=optim, + ppo_micro_batch_size_per_gpu=4, + rollout_n=1, + ) + 
self.assertIn("Invalid loss_agg_mode", str(cm.exception)) + + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_micro_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + optim=optim, + rollout_n=1, + ) + self.assertIn("You have set both", str(cm.exception)) + + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + optim=optim, + rollout_n=1, + ) + self.assertIn("Please set at least one", str(cm.exception)) + + config = ActorConfig( + strategy="fsdp", + use_dynamic_bsz=True, + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + optim=optim, + rollout_n=1, + ) + self.assertIsNotNone(config) # Should not raise an exception + + def test_fsdp_actor_config_validation_exceptions(self): + """Test that FSDPActorConfig.validate() raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + config = FSDPActorConfig( + strategy="fsdp", + ulysses_sequence_parallel_size=2, + use_dynamic_bsz=True, # Skip batch size validation to focus on FSDP validation + optim=optim, + rollout_n=1, + ) + + model_config = {"use_remove_padding": False} + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=8, train_batch_size=256, model_config=model_config) + self.assertIn("you must enable `use_remove_padding`", str(cm.exception)) + + def test_actor_config_validate_method_exceptions(self): + """Test that ActorConfig.validate() raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + config = ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_mini_batch_size=256, + ppo_micro_batch_size=8, + ppo_micro_batch_size_per_gpu=None, # Ensure only one batch size setting is used + optim=optim, + rollout_n=1, + ) + + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=8, train_batch_size=128) + 
self.assertIn("train_batch_size", str(cm.exception)) + + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=16, train_batch_size=512) + self.assertIn("must be >= n_gpus", str(cm.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..fb03560e0f491c3243ce9384b48821110c720fa5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py @@ -0,0 +1,305 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from pathlib import Path + +import pytest +from hydra import compose, initialize_config_dir + +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.profiler import ProfilerConfig +from verl.workers.config import ( + CriticConfig, + FSDPCriticConfig, + FSDPOptimizerConfig, + McoreCriticConfig, + McoreOptimizerConfig, + OptimizerConfig, +) + + +@pytest.mark.skip(reason="This test is flaky when we actively load model config") +class TestCriticConfig: + """Test suite for critic configuration dataclasses.""" + + @pytest.fixture + def config_dir(self): + """Get the path to the config directory.""" + return Path(__file__).parent.parent.parent.parent / "verl" / "trainer" / "config" / "critic" + + def test_megatron_critic_config_instantiation_from_yaml(self, config_dir): + """Test that McoreCriticConfig can be instantiated from megatron_critic.yaml.""" + yaml_path = config_dir / "megatron_critic.yaml" + assert yaml_path.exists(), f"Config file not found: {yaml_path}" + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/critic")): + test_config = compose(config_name="megatron_critic", overrides=["ppo_micro_batch_size_per_gpu=1"]) + + megatron_config_obj = omega_conf_to_dataclass(test_config) + + assert isinstance(megatron_config_obj, McoreCriticConfig) + assert isinstance(megatron_config_obj, CriticConfig) + + expected_attrs = [ + "strategy", + "rollout_n", + "optim", + "model", + "ppo_mini_batch_size", + "ppo_max_token_len_per_gpu", + "cliprange_value", + "get", + "nccl_timeout", + "megatron", + "load_weight", + ] + for attr in expected_attrs: + assert hasattr(megatron_config_obj, attr), f"Missing attribute: {attr}" + + assert callable(megatron_config_obj.get) + assert megatron_config_obj.strategy == "megatron" + + def test_fsdp_critic_config_instantiation_from_yaml(self, config_dir): + """Test that FSDPCriticConfig can be instantiated from dp_critic.yaml.""" + yaml_path = config_dir / "dp_critic.yaml" + assert 
yaml_path.exists(), f"Config file not found: {yaml_path}" + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/critic")): + test_config = compose(config_name="dp_critic", overrides=["ppo_micro_batch_size_per_gpu=1"]) + + fsdp_config_obj = omega_conf_to_dataclass(test_config) + + assert isinstance(fsdp_config_obj, FSDPCriticConfig) + assert isinstance(fsdp_config_obj, CriticConfig) + + expected_attrs = [ + "strategy", + "rollout_n", + "optim", + "model", + "ppo_mini_batch_size", + "ppo_max_token_len_per_gpu", + "cliprange_value", + "get", + "forward_micro_batch_size", + "forward_micro_batch_size_per_gpu", + "ulysses_sequence_parallel_size", + "grad_clip", + ] + for attr in expected_attrs: + assert hasattr(fsdp_config_obj, attr), f"Missing attribute: {attr}" + + assert callable(fsdp_config_obj.get) + assert fsdp_config_obj.strategy == "fsdp" + + def test_config_inheritance_hierarchy(self): + """Test that the inheritance hierarchy is correct.""" + megatron_config = McoreCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=McoreOptimizerConfig(lr=0.1)) + assert isinstance(megatron_config, CriticConfig) + assert isinstance(megatron_config, McoreCriticConfig) + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + assert isinstance(fsdp_config, CriticConfig) + assert isinstance(fsdp_config, FSDPCriticConfig) + + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=OptimizerConfig(lr=0.1)) + assert isinstance(critic_config, CriticConfig) + assert not isinstance(critic_config, McoreCriticConfig) + assert not isinstance(critic_config, FSDPCriticConfig) + + def test_config_dict_interface(self): + """Test that configs provide dict-like interface from BaseConfig.""" + optim = OptimizerConfig(lr=0.1) + config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + + assert "strategy" in config + assert config["strategy"] == "fsdp2" + + assert 
config.get("strategy") == "fsdp2" + assert config.get("nonexistent_key", "default") == "default" + + keys = list(config) + assert "strategy" in keys + assert "rollout_n" in keys + + assert len(config) > 0 + + def test_frozen_fields_immutability(self): + """Test that frozen fields raise exceptions when modified after creation.""" + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=OptimizerConfig(lr=0.1)) + frozen_fields = ["rollout_n", "strategy", "cliprange_value"] + + for field_name in frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(critic_config, field_name, "modified_value") + + megatron_config = McoreCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=McoreOptimizerConfig(lr=0.1)) + megatron_frozen_fields = ["nccl_timeout", "load_weight", "data_loader_seed"] + + for field_name in megatron_frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(megatron_config, field_name, "modified_value") + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + fsdp_frozen_fields = ["ulysses_sequence_parallel_size", "grad_clip"] + + for field_name in fsdp_frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(fsdp_config, field_name, "modified_value") + + def test_batch_size_fields_modifiable(self): + """Test that batch size fields can be modified after creation.""" + optim = OptimizerConfig(lr=0.1) + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + + critic_config.ppo_mini_batch_size = 8 + critic_config.ppo_micro_batch_size = 4 + critic_config.ppo_micro_batch_size_per_gpu = 2 + + assert critic_config.ppo_mini_batch_size == 8 + assert critic_config.ppo_micro_batch_size == 4 + assert critic_config.ppo_micro_batch_size_per_gpu == 2 + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + + 
fsdp_config.forward_micro_batch_size = 16 + fsdp_config.forward_micro_batch_size_per_gpu = 8 + + assert fsdp_config.forward_micro_batch_size == 16 + assert fsdp_config.forward_micro_batch_size_per_gpu == 8 + + def test_profiler_config_type_validation(self): + """Test that profiler field has correct type and validation.""" + optim = OptimizerConfig(lr=0.1) + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + assert isinstance(critic_config.profiler, ProfilerConfig) + assert critic_config.profiler.all_ranks is False + assert critic_config.profiler.ranks == [] + + custom_profiler = ProfilerConfig(all_ranks=True, ranks=[0, 1]) + critic_config_custom = CriticConfig( + profiler=custom_profiler, ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim + ) + assert isinstance(critic_config_custom.profiler, ProfilerConfig) + assert critic_config_custom.profiler.all_ranks is True + assert critic_config_custom.profiler.ranks == [0, 1] + + profiler1 = ProfilerConfig(enable=True, ranks=[0, 1]) + profiler2 = ProfilerConfig(all_ranks=True, ranks=[1, 2]) + + union_result = profiler1.union(profiler2) + assert union_result.enable is True + assert union_result.all_ranks is True + assert set(union_result.ranks) == {0, 1, 2} + + intersect_result = profiler1.intersect(profiler2) + assert intersect_result.all_ranks is False + assert intersect_result.ranks == [1] + + def test_critic_config_validation_logic(self): + """Test the __post_init__ validation logic for CriticConfig.""" + optim = OptimizerConfig(lr=0.1) + valid_config = CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, use_dynamic_bsz=False, optim=optim + ) + assert valid_config.ppo_micro_batch_size_per_gpu == 2 + + valid_config2 = CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size_per_gpu=None, + ppo_micro_batch_size=4, + ppo_mini_batch_size=8, + use_dynamic_bsz=False, + optim=optim, + ) + assert valid_config2.ppo_micro_batch_size == 4 + + dynamic_config = 
CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, use_dynamic_bsz=True, optim=optim + ) + assert dynamic_config.use_dynamic_bsz is True + + with pytest.raises(ValueError, match="You have set both.*micro_batch_size.*AND.*micro_batch_size_per_gpu"): + CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + use_dynamic_bsz=False, + optim=optim, + ) + + with pytest.raises( + ValueError, match="Please set at least one of.*micro_batch_size.*or.*micro_batch_size_per_gpu" + ): + CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + use_dynamic_bsz=False, + optim=optim, + ) + + def test_micro_batch_size_divisibility_validation(self): + """Test micro batch size divisibility validation in __post_init__.""" + optim = OptimizerConfig(lr=0.1) + valid_config = CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, ppo_mini_batch_size=8, use_dynamic_bsz=False, optim=optim + ) + assert valid_config.ppo_mini_batch_size == 8 + assert valid_config.ppo_micro_batch_size_per_gpu == 2 + + valid_config_with_mbs = CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=8, ppo_micro_batch_size=4, use_dynamic_bsz=False, optim=optim + ) + assert valid_config_with_mbs.ppo_mini_batch_size == 8 + assert valid_config_with_mbs.ppo_micro_batch_size == 4 + + with pytest.raises(ValueError, match="ppo_mini_batch_size.*must be divisible by.*ppo_micro_batch_size"): + CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=7, ppo_micro_batch_size=4, use_dynamic_bsz=False, optim=optim + ) + + dynamic_config = CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=7, ppo_micro_batch_size=4, use_dynamic_bsz=True, optim=optim + ) + assert dynamic_config.use_dynamic_bsz is True + + def test_fsdp_sequence_parallelism_validation(self): + """Test FSDP sequence parallelism validation in FSDPCriticConfig.__post_init__.""" + valid_config = FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + 
ulysses_sequence_parallel_size=2, + model={"use_remove_padding": True}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + assert valid_config.ulysses_sequence_parallel_size == 2 + + with pytest.raises( + ValueError, match="When using sequence parallelism for critic, you must enable.*use_remove_padding" + ): + FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + ulysses_sequence_parallel_size=2, + model={"use_remove_padding": False}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + + valid_config_no_sp = FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + ulysses_sequence_parallel_size=1, + model={"use_remove_padding": False}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + assert valid_config_no_sp.ulysses_sequence_parallel_size == 1 diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1253f5c9ab9943df3c187a3c8458b35f78fe6994 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py @@ -0,0 +1,67 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from verl.workers.config.engine import FSDPEngineConfig, McoreEngineConfig + + +class TestMcoreEngineConfig: + def test_default_values(self): + config = McoreEngineConfig() + assert config.tensor_model_parallel_size == 1 + assert config.sequence_parallel is False # Should be auto-corrected + assert config.seed == 42 + + def test_post_init_validation(self): + # Test TP size 1 forces sequence_parallel=False + config = McoreEngineConfig(tensor_model_parallel_size=1) + assert config.sequence_parallel is False + + # Test TP >1 keeps sequence_parallel=True + config = McoreEngineConfig(tensor_model_parallel_size=2) + assert config.sequence_parallel is True + + def test_mutable_fields(self): + config = McoreEngineConfig() + config.sequence_parallel = True # Should be mutable + with pytest.raises(AttributeError): + config.tensor_model_parallel_size = 2 # Frozen field + + @pytest.mark.parametrize("offload_field", ["param_offload", "grad_offload", "optimizer_offload"]) + def test_offload_flags(self, offload_field): + config = McoreEngineConfig(**{offload_field: True}) + assert getattr(config, offload_field) is True + + +class TestFSDPEngineConfigCPU: + def test_default_values(self): + config = FSDPEngineConfig() + assert config.param_offload is False + assert config.optimizer_offload is False + assert config.fsdp_size == -1 + + @pytest.mark.parametrize( + "offload_params", + [{"param_offload": True}, {"optimizer_offload": True}, {"param_offload": True, "optimizer_offload": True}], + ) + def test_offload_combinations(self, offload_params): + config = FSDPEngineConfig(**offload_params) + assert config.param_offload == offload_params.get("param_offload", False) + assert config.optimizer_offload == offload_params.get("optimizer_offload", False) + + def test_wrap_policy_configuration(self): + test_policy = {"layer_class": "TransformerBlock"} + config = FSDPEngineConfig(wrap_policy=test_policy) + assert config.wrap_policy == test_policy diff --git 
a/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..b44cb40c6b1dceca7da61af2bcebeb20d0fb9b58 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py @@ -0,0 +1,48 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from verl.workers.config.optimizer import FSDPOptimizerConfig + + +class TestFSDPOptimizerConfigCPU: + def test_default_configuration(self): + config = FSDPOptimizerConfig(lr=0.1) + assert config.min_lr_ratio is None + assert config.lr_scheduler_type == "constant" + assert config.num_cycles == 0.5 + + @pytest.mark.parametrize("lr_scheduler_type", ["constant", "cosine"]) + def test_valid_lr_scheduler_types(self, lr_scheduler_type): + config = FSDPOptimizerConfig(lr_scheduler_type=lr_scheduler_type, lr=0.1) + assert config.lr_scheduler_type == lr_scheduler_type + + @pytest.mark.parametrize("warmup_style", ["constant", "cosine"]) + def test_valid_warmup_style_types(self, warmup_style): + config = FSDPOptimizerConfig(warmup_style=warmup_style, lr=0.1) + assert config.lr_scheduler_type == warmup_style + + def test_invalid_lr_scheduler_type(self): + with pytest.raises((ValueError, AssertionError)): + FSDPOptimizerConfig(lr_scheduler_type="invalid_style", lr=0.1) + + def 
test_invalid_warmup_style_type(self): + with pytest.raises((ValueError, AssertionError)): + FSDPOptimizerConfig(warmup_style="invalid_style", lr=0.1) + + @pytest.mark.parametrize("num_cycles", [0.1, 1.0, 2.5]) + def test_num_cycles_configuration(self, num_cycles): + config = FSDPOptimizerConfig(num_cycles=num_cycles, lr=0.1) + assert config.num_cycles == num_cycles diff --git a/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py b/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py new file mode 100644 index 0000000000000000000000000000000000000000..d6eaa10cf17ffa10a686c9530d8c291f73c98fcb --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import tempfile
import unittest
from unittest.mock import Mock, patch

import torch
import torch.distributed
from omegaconf import OmegaConf
from tensordict import TensorDict
from transformers import AutoConfig

from verl import DataProto
from verl.workers.config import FSDPCriticConfig, FSDPOptimizerConfig
from verl.workers.config.critic import FSDPCriticModelCfg
from verl.workers.config.engine import FSDPEngineConfig
from verl.workers.fsdp_workers import CriticWorker


class TestCriticWorker(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Set up distributed environment"""
        # NOTE(review): init_method="env://" assumes RANK/WORLD_SIZE/MASTER_ADDR/
        # MASTER_PORT are set, i.e. this test is launched via torchrun — confirm.
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(
                backend="nccl" if torch.cuda.is_available() else "gloo", init_method="env://"
            )

        cls.rank = torch.distributed.get_rank()
        cls.world_size = torch.distributed.get_world_size()

        # Pin each rank to its own GPU when CUDA is available.
        if torch.cuda.is_available():
            torch.cuda.set_device(cls.rank)
            cls.device = torch.device(f"cuda:{cls.rank}")
        else:
            cls.device = torch.device("cpu")

    @classmethod
    def tearDownClass(cls):
        """Clean up distributed environment"""
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()

    def setUp(self):
        """Set up test fixtures"""

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()

        # Save only the model *config* to the temp dir; the full weights are
        # loaded from model_path by CriticWorker itself.
        model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
        config = AutoConfig.from_pretrained(model_path)
        config.save_pretrained(self.temp_dir)

        self.config = FSDPCriticConfig(
            strategy="fsdp2",
            ppo_mini_batch_size=4,
            ppo_micro_batch_size_per_gpu=2,
            forward_micro_batch_size_per_gpu=2,
            ppo_epochs=1,
            cliprange_value=0.5,
            grad_clip=1.0,
            use_dynamic_bsz=False,
            ulysses_sequence_parallel_size=1,
            rollout_n=1,
            optim=FSDPOptimizerConfig(lr=1e-6),
            model=FSDPCriticModelCfg(
                path=model_path,
                tokenizer_path=model_path,
                fsdp_config=FSDPEngineConfig(fsdp_size=-1),
                use_remove_padding=False,
            ),
        )
        # Cap world size at 2 — presumably so ppo_mini_batch_size (4) still
        # splits into micro batches of 2 per rank; TODO confirm the intent.
        assert self.world_size <= 4 // 2

    def tearDown(self):
        """Clean up test fixtures"""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _create_test_data_for_compute_values(self, batch_size=2, seq_len=10, response_len=5):
        """Create test data for compute_values method"""
        input_ids = torch.randint(0, 1000, (batch_size, seq_len), dtype=torch.long)
        attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
        position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)
        responses = torch.randint(0, 1000, (batch_size, response_len), dtype=torch.long)
        response_mask = torch.ones(batch_size, response_len, dtype=torch.float)

        batch = TensorDict(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "responses": responses,
                "response_mask": response_mask,
            },
            batch_size=[batch_size],
        )

        data = DataProto(
            batch=batch, meta_info={"micro_batch_size": 2, "max_token_len": seq_len, "use_dynamic_bsz": False}
        )

        return data

    def _create_test_data_for_update_critic(self, batch_size=2, seq_len=10, response_len=5):
        """Create test data for update_critic method"""
        input_ids = torch.randint(0, 1000, (batch_size, seq_len), dtype=torch.long)
        attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
        position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)
        responses = torch.randint(0, 1000, (batch_size, response_len), dtype=torch.long)
        response_mask = torch.ones(batch_size, response_len, dtype=torch.float)
        # update_critic additionally needs baseline values and returns.
        values = torch.randn(batch_size, response_len, dtype=torch.float)
        returns = torch.randn(batch_size, response_len, dtype=torch.float)

        batch = TensorDict(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "responses": responses,
                "response_mask": response_mask,
                "values": values,
                "returns": returns,
            },
            batch_size=[batch_size],
        )

        data = DataProto(
            batch=batch,
            meta_info={"global_token_num": [response_len] * batch_size, "batch_seqlens": [response_len] * batch_size},
        )

        return data

    def test_init_model(self):
        """Test CriticWorker.init_model() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        self.assertIsNotNone(worker.critic_module)
        self.assertIsNotNone(worker.critic_optimizer)
        self.assertIsNotNone(worker.critic)
        self.assertIsNotNone(worker.checkpoint_manager)

    def test_compute_values(self):
        """Test CriticWorker.compute_values() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        data = self._create_test_data_for_compute_values()

        result = worker.compute_values(data)

        self.assertIsInstance(result, DataProto)
        self.assertIn("values", result.batch)
        values = result.batch["values"]

        # Values are expected per response token only (see fixture sizes above).
        batch_size, response_len = 2, 5
        self.assertEqual(values.shape, (batch_size, response_len))

        self.assertTrue(torch.isfinite(values).all())

    def test_update_critic(self):
        """Test CriticWorker.update_critic() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        data = self._create_test_data_for_update_critic()

        result = worker.update_critic(data)

        self.assertIsInstance(result, DataProto)
        self.assertIn("metrics", result.meta_info)
        metrics = result.meta_info["metrics"]

        expected_keys = ["critic/vf_loss", "critic/vf_clipfrac", "critic/vpred_mean", "critic/grad_norm"]
        for key in expected_keys:
            self.assertIn(key, metrics)

        # Every reported metric (scalar or list of scalars) must be finite.
        for key, value in metrics.items():
            if isinstance(value, list | tuple):
                for v in value:
                    self.assertTrue(torch.isfinite(torch.tensor(v)).all())
            else:
                self.assertTrue(torch.isfinite(torch.tensor(value)).all())

    @patch("transformers.AutoConfig.from_pretrained")
    def test_critic_attn_implementation_override_functionality(self, mock_config_from_pretrained):
        """Test that CriticWorker correctly uses attn_implementation from override_config"""

        # Mock the AutoConfig return value
        mock_config = Mock()
        mock_config.tie_word_embeddings = False
        mock_config.architectures = ["LlamaForCausalLM"]
        mock_config.num_labels = 1
        mock_config_from_pretrained.return_value = mock_config

        # Test different attn_implementation values
        test_cases = [
            ("eager", "eager"),
            ("sdpa", "sdpa"),
            ("flash_attention_2", "flash_attention_2"),
            (None, "flash_attention_2"),  # Default case
        ]

        for override_value, expected_value in test_cases:
            mock_config_from_pretrained.reset_mock()

            # Create config with override_config
            config_dict = {
                "model": {
                    "path": "/test/model/path",
                    "tokenizer_path": "/test/tokenizer/path",
                    "fsdp_config": {
                        "fsdp_size": 1,
                        "param_offload": False,
                        "optimizer_offload": False,
                    },
                },
                "optim": {"lr": 1e-4, "type": "AdamW"},
                "strategy": "fsdp",
                "ppo_mini_batch_size": 1,
                "ppo_epochs": 1,
                "rollout_n": 1,
                "checkpoint": {"save_contents": [], "load_contents": []},
            }

            # Add override_config with attn_implementation if specified
            if override_value is not None:
                config_dict["model"]["override_config"] = {"attn_implementation": override_value}

            # Convert to OmegaConf
            test_config = OmegaConf.create(config_dict)

            # Test the extraction logic that should happen in CriticWorker._build_critic_model_optimizer
            override_config = OmegaConf.to_container(OmegaConf.create(test_config.model.get("override_config", {})))
            extracted_attn_implementation = override_config.get("attn_implementation", "flash_attention_2")

            # Verify the extraction works correctly
            self.assertEqual(
                extracted_attn_implementation,
                expected_value,
                f"Expected {expected_value}, got {extracted_attn_implementation} for override_value {override_value}",
            )

    def test_critic_model_config_structure(self):
        """Test that critic model config properly incorporates override settings"""

        # Test configuration scenarios
        test_scenarios = [
            {"name": "default_flash_attention", "override_config": {}, "expected_attn": "flash_attention_2"},
            {"name": "eager_override", "override_config": {"attn_implementation": "eager"}, "expected_attn": "eager"},
            {"name": "sdpa_override", "override_config": {"attn_implementation": "sdpa"}, "expected_attn": "sdpa"},
            {
                "name": "mixed_config",
                "override_config": {"attn_implementation": "eager", "dropout": 0.1, "num_labels": 1},
                "expected_attn": "eager",
            },
        ]

        for scenario in test_scenarios:
            with self.subTest(scenario=scenario["name"]):
                # Simulate the config processing logic from CriticWorker
                override_config = scenario["override_config"]

                # Test the extraction logic
                extracted_attn = override_config.get("attn_implementation", "flash_attention_2")

                # Verify correct extraction
                self.assertEqual(extracted_attn, scenario["expected_attn"], f"Failed for scenario {scenario['name']}")

                # Verify other configs are preserved
                if "dropout" in override_config:
                    self.assertEqual(override_config["dropout"], 0.1)

    def test_critic_hydra_config_compatibility(self):
        """Test that Hydra +prefix configurations work correctly for CriticWorker"""

        # Simulate Hydra configuration with +prefix for critic
        # This would come from: +critic.model.override_config.attn_implementation=eager
        hydra_config_dict = {
            "critic": {"model": {"path": "/test/model/path", "override_config": {"attn_implementation": "eager"}}}
        }

        omegaconf = OmegaConf.create(hydra_config_dict)

        # Extract override config as would be done in CriticWorker
        override_model_config = OmegaConf.to_container(
            OmegaConf.create(omegaconf.critic.model.get("override_config", {}))
        )

        # Test extraction
        attn_implementation = override_model_config.get("attn_implementation", "flash_attention_2")
        self.assertEqual(attn_implementation, "eager")

    def test_critic_backward_compatibility(self):
        """Test that CriticWorker maintains backward compatibility with existing configurations"""

        # Test cases for backward compatibility
        compatibility_tests = [
            {"name": "no_override_config", "config": {}, "expected": "flash_attention_2"},
            {"name": "empty_override_config", "config": {"override_config": {}}, "expected": "flash_attention_2"},
            {
                "name": "other_overrides_only",
                "config": {"override_config": {"dropout": 0.1, "hidden_size": 768}},
                "expected": "flash_attention_2",
            },
        ]

        for test in compatibility_tests:
            with self.subTest(test=test["name"]):
                override_config = test["config"].get("override_config", {})
                attn_implementation = override_config.get("attn_implementation", "flash_attention_2")

                self.assertEqual(
                    attn_implementation, test["expected"], f"Backward compatibility failed for {test['name']}"
                )

    def test_critic_and_actor_independent_configuration(self):
        """Test that critic and actor can have independent attention implementation configurations"""

        # Simulate a complete training configuration with both actor and critic
        complete_config = {
            "actor_rollout_ref": {"model": {"override_config": {"attn_implementation": "eager"}}},
            "critic": {"model": {"override_config": {"attn_implementation": "sdpa"}}},
        }

        omegaconf = OmegaConf.create(complete_config)

        # Extract actor config
        actor_override = OmegaConf.to_container(
            OmegaConf.create(omegaconf.actor_rollout_ref.model.get("override_config", {}))
        )
        actor_attn = actor_override.get("attn_implementation", "flash_attention_2")

        # Extract critic config
        critic_override = OmegaConf.to_container(OmegaConf.create(omegaconf.critic.model.get("override_config", {})))
        critic_attn = critic_override.get("attn_implementation", "flash_attention_2")

        # Verify independent configuration
        self.assertEqual(actor_attn, "eager")
        self.assertEqual(critic_attn, "sdpa")
        self.assertNotEqual(actor_attn, critic_attn)  # Ensure they are indeed different


if __name__ == "__main__":
    unittest.main()
diff --git a/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py
b/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..9932ae8917805e3c92bbc0e11abd398463e8e87a --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py @@ -0,0 +1,94 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +# Assuming REWARD_MANAGER_REGISTRY is defined somewhere in the module +from verl.workers.reward_manager.registry import REWARD_MANAGER_REGISTRY, get_reward_manager_cls, register + + +@pytest.fixture +def setup(): + """Setup test cases with a mock registry.""" + REWARD_MANAGER_REGISTRY.clear() + REWARD_MANAGER_REGISTRY.update({"manager1": "Manager1Class", "manager2": "Manager2Class"}) + return REWARD_MANAGER_REGISTRY + + +def test_get_existing_manager(setup): + """Test getting an existing reward manager class.""" + assert get_reward_manager_cls("manager1") == "Manager1Class" + assert get_reward_manager_cls("manager2") == "Manager2Class" + + +def test_get_nonexistent_manager(setup): + """Test getting a non-existent reward manager raises ValueError.""" + with pytest.raises(ValueError) as excinfo: + get_reward_manager_cls("unknown_manager") + assert "Unknown reward manager: unknown_manager" in str(excinfo.value) + + +def test_case_sensitivity(setup): + """Test that manager names are case-sensitive.""" + with pytest.raises(ValueError): + 
get_reward_manager_cls("MANAGER1") + with pytest.raises(ValueError): + get_reward_manager_cls("Manager1") + + +def test_empty_registry(setup): + """Test behavior when registry is empty.""" + REWARD_MANAGER_REGISTRY.clear() + with pytest.raises(ValueError) as excinfo: + get_reward_manager_cls("any_manager") + assert "Unknown reward manager: any_manager" in str(excinfo.value) + + +def test_register_new_class(setup): + """Test registering a new class with the decorator.""" + + @register("test_manager") + class TestManager: + pass + + assert "test_manager" in REWARD_MANAGER_REGISTRY + assert REWARD_MANAGER_REGISTRY["test_manager"] == TestManager + + +def test_register_different_classes_same_name(setup): + """Test that registering different classes with same name raises ValueError.""" + + @register("conflict_manager") + class Manager1: + pass + + with pytest.raises(ValueError): + + @register("conflict_manager") + class Manager2: + pass + + assert REWARD_MANAGER_REGISTRY["conflict_manager"] == Manager1 + + +def test_decorator_returns_original_class(setup): + """Test that the decorator returns the original class unchanged.""" + + @register("return_test") + class OriginalClass: + def method(setup): + return 42 + + assert OriginalClass().method() == 42 + assert REWARD_MANAGER_REGISTRY["return_test"] == OriginalClass diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py b/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py new file mode 100644 index 0000000000000000000000000000000000000000..d7239ea88dd14f6b7fc4927388ff47273c02a34e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py @@ -0,0 +1,138 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Compare vLLM AsyncLLM backend: ExternalRayDistributedExecutor(remote call) vs RayDistributedExecutor(compiled graph) + +1. Prepare openai/gsm8k dataset +python3 examples/data_preprocess/gsm8k.py + +2. Run perf test +python3 tests/workers/rollout/perf/vllm_async_rollout.py >perf.log 2>&1 + +hardware: Nvidia 8*H20 +packages: +- torch==2.6.0 +- vllm==0.8.5 + +[DEBUG] backend: sync, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 21.27 secs +[DEBUG] backend: zeromq, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 23.40 secs +[DEBUG] backend: ray, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 25.33 secs +""" + +import os +import time + +import ray +from omegaconf import DictConfig +from torch.utils.data import SequentialSampler +from torchdata.stateful_dataloader import StatefulDataLoader + +from tests.experimental.agent_loop.agent_utils import AgentLoopManager, RayWorkerGroup, init_agent_loop_manager +from verl.protocol import DataProto +from verl.utils import hf_tokenizer +from verl.utils.dataset import RLHFDataset +from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn + + +def init_config(n_gpus_per_node) -> DictConfig: + import os + + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose( + config_name="ppo_trainer", + overrides=[ + "actor_rollout_ref.actor.use_dynamic_bsz=true", + "actor_rollout_ref.actor.fsdp_config.param_offload=True", + "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True", + 
], + ) + config.trainer.n_gpus_per_node = n_gpus_per_node + config.data.train_batch_size = 128 + config.data.return_raw_chat = True + config.actor_rollout_ref.model.path = "Qwen/Qwen2.5-7B-Instruct" + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2 + config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9 + config.actor_rollout_ref.rollout.multi_turn.format = "hermes" + config.actor_rollout_ref.rollout.prompt_length = 4096 + config.actor_rollout_ref.rollout.response_length = 4096 + config.actor_rollout_ref.rollout.n = 16 + + return config + + +def initialize(config, backend) -> tuple[AgentLoopManager | RayWorkerGroup, StatefulDataLoader]: + env_vars = { + "NCCL_DEBUG": "WARN", + "VLLM_USE_V1": "1", + "VERL_VLLM_DISTRIBUTED_BACKEND": backend, + } + ray.init(runtime_env={"env_vars": env_vars}) + + # STEP 1: init async llm server + server = init_agent_loop_manager(config) + + # STEP 2: create dataloader + tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path) + dataset = RLHFDataset( + data_files=os.path.expanduser("~/data/gsm8k/train.parquet"), + tokenizer=tokenizer, + config=config.data, + ) + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=config.data.get("gen_batch_size", config.data.train_batch_size), + num_workers=config.data.get("dataloader_num_workers", 8), + drop_last=True, + collate_fn=default_collate_fn, + sampler=SequentialSampler(dataset), + ) + + return server, dataloader + + +def perf_rollout(mode, backend, n_gpus_per_node, num_steps): + config = init_config(n_gpus_per_node) + config.actor_rollout_ref.rollout.mode = mode + agent_loop_manager, dataloader = initialize(config, backend) + + for step, batch in enumerate(dataloader): + batch: DataProto = DataProto.from_single_dict(batch) + batch = batch.pop( + batch_keys=["input_ids", "attention_mask", "position_ids"], + non_tensor_batch_keys=["raw_prompt_ids", "raw_prompt"], + ) + t_start = time.time() + gen_batch 
= agent_loop_manager.generate_sequences(batch) + t_end = time.time() + print( + f"[DEBUG] backend: {backend}, n_gpus_per_node: {n_gpus_per_node}, batch_size: {len(gen_batch)}, " + f"step: {step}, step_time: {t_end - t_start:.2f} secs" + ) + if step + 1 >= num_steps: + break + + ray.shutdown() + + +if __name__ == "__main__": + num_steps = 1 + n_gpus_per_node = 8 + + # test_cases = [("sync", "sync"), ("async", "zeromq"), ("async", "ray")] + test_cases = [("async", "zeromq"), ("async", "ray")] + for mode, backend in test_cases: + perf_rollout(mode=mode, backend=backend, n_gpus_per_node=n_gpus_per_node, num_steps=num_steps) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config new file mode 100644 index 0000000000000000000000000000000000000000..aa3f1eec5af8477543a487bacd602ab0d2f7390b --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config @@ -0,0 +1,17 @@ +tools: + - class_name: "verl.tools.sandbox_fusion_tools.SandboxFusionTool" + config: + sandbox_fusion_url: "https://xxx.apigateway-cn-beijing.volceapi.com/run_code" + type: native + tool_schema: + type: "function" + function: + name: "code_interpreter" + description: "A tool for executing code." + parameters: + type: "object" + properties: + code: + type: "string" + description: "The code to execute." 
          required: ["code"]
diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config
new file mode 100644
index 0000000000000000000000000000000000000000..926b6b832f283175f92cc86b6cc4a1964096a8d3
--- /dev/null
+++ b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config
@@ -0,0 +1,23 @@
tools:
  - class_name: verl.tools.search_tool.SearchTool
    config:
      retrieval_service_url: http://127.0.0.1:8000/retrieve
      num_workers: 120
      rate_limit: 120
      timeout: 30
      type: native
    tool_schema:
      type: function
      function:
        name: search
        description: Searches the web for relevant information based on the given query.
        parameters:
          type: object
          properties:
            query_list:
              type: array
              # JSON Schema's array-element keyword is "items"; the previous
              # "item" key is not a schema keyword and was silently ignored.
              items:
                type: string
              description: A list of fully-formed semantic queries. The tool will return search results for each query.
          required:
            - query_list
diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89607705fef92b7ea728cceee7275fa8054c1d0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py
@@ -0,0 +1,978 @@
# Copyright 2025 z.ai
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is adapted from multiple sources: +# 1. THUDM/slime project +# Original source: https://github.com/THUDM/slime/blob/main/slime/backends/sglang_utils/http_server_engine.py +# Copyright 2025 z.ai +# Licensed under the Apache License, Version 2.0 +# 2. SGLang project +# Original source: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server_engine.py +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 +# +# Modifications made by z.ai and ModelBest Inc. include but are not limited to: +# - Enhanced error handling and retry logic +# - Added async support with connection pooling +# - Extended functionality for distributed weight updates +# - Improved logging and monitoring capabilities +# - Additional configuration options and optimizations + +"""Complete unit tests for HTTP Server Engine Adapters. + +This module contains comprehensive unit tests for both HttpServerEngineAdapter +and AsyncHttpServerEngineAdapter classes, covering all public methods, +error handling scenarios, edge cases, and boundary conditions using pytest and mock frameworks. + +Tests use real SGLang modules for integration testing while mocking external dependencies. 
+""" + +import asyncio +from unittest.mock import AsyncMock, Mock, patch + +import aiohttp +import pytest +import requests +from sglang.srt.managers.io_struct import ( + UpdateWeightsFromTensorReqInput, +) +from sglang.srt.utils import MultiprocessingSerializer + +# Import the module under test +from verl.workers.rollout.sglang_rollout.http_server_engine import ( + AsyncHttpServerAdapter, + HttpServerAdapter, + launch_server_process, +) + + +@pytest.fixture(scope="session") +def event_loop(): + """Create an event loop for the entire test session.""" + loop = asyncio.new_event_loop() + yield loop + loop.close() + + +@pytest.fixture +def basic_adapter_kwargs(): + """Provide basic kwargs for creating HTTP server adapters.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def router_adapter_kwargs(): + """Provide kwargs for creating adapters with router configuration.""" + return { + "router_ip": "192.168.1.1", + "router_port": 8080, + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def non_master_adapter_kwargs(): + """Provide kwargs for creating non-master node adapters.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 1, # Non-master + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def mock_launch_server_process(): + """Mock the launch_server_process function for testing without actual server startup.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.launch_server_process") as mock_launch: + mock_process = Mock() + mock_process.is_alive.return_value = True + mock_process.pid = 12345 + mock_launch.return_value = mock_process + yield mock_launch + + +@pytest.fixture +def mock_multiprocessing_process(): + """Create mock multiprocessing.Process for testing without actual process creation.""" + from unittest.mock import patch + + with 
patch("verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process") as mock_process_class: + mock_process = Mock() + mock_process.is_alive.return_value = True + mock_process.pid = 12345 + mock_process_class.return_value = mock_process + yield mock_process + + +@pytest.fixture +def mock_requests_session(): + """Create mock requests.Session for testing HTTP interactions.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session = Mock() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_session.get.return_value = mock_response + mock_session.post.return_value = mock_response + mock_session_class.return_value.__enter__.return_value = mock_session + yield mock_session + + +@pytest.fixture +def mock_requests_post(): + """Mock requests.post for testing HTTP POST requests.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_post.return_value = mock_response + yield mock_post + + +@pytest.fixture +def mock_requests_get(): + """Mock requests.get for testing HTTP GET requests.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_get.return_value = mock_response + yield mock_get + + +@pytest.fixture +def mock_aiohttp_session(): + """Create mock aiohttp.ClientSession for testing async HTTP interactions.""" + mock_session = AsyncMock() + mock_session.closed = False + + # Mock response + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = 
AsyncMock(return_value={"status": "success"}) + mock_response.raise_for_status = Mock() + + # Mock context managers + mock_session.get.return_value.__aenter__.return_value = mock_response + mock_session.post.return_value.__aenter__.return_value = mock_response + + return mock_session + + +@pytest.fixture +def mock_kill_process_tree(): + """Mock kill_process_tree function for testing cleanup without actual process termination.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.kill_process_tree") as mock_kill: + yield mock_kill + + +# Test environment fixtures for real SGLang testing +@pytest.fixture(scope="session") +def sglang_test_model_path(): + """Provide a test model path for SGLang tests. + + This can be overridden by environment variable SGLANG_TEST_MODEL_PATH + for tests that need a real model. + """ + import os + + return os.getenv("SGLANG_TEST_MODEL_PATH", "/tmp/test_model") + + +@pytest.fixture +def real_adapter_kwargs(sglang_test_model_path): + """Provide kwargs for creating adapters with real SGLang integration.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": sglang_test_model_path, + } + + +@pytest.fixture(autouse=True) +def mock_server_args_post_init(): + """Mock ServerArgs.__post_init__ to skip model path validation.""" + from unittest.mock import patch + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.ServerArgs.__post_init__", return_value=None + ) as mock_post_init: + yield mock_post_init + + +class TestLaunchServerProcess: + """Test cases for launch_server_process function.""" + + def test_launch_server_process_success( + self, mock_multiprocessing_process, mock_requests_session, real_adapter_kwargs + ): + """Test successful server process launch and health check.""" + # Import real SGLang ServerArgs + from sglang.srt.server_args import ServerArgs + + # Create server args using real ServerArgs + server_args = 
ServerArgs(**real_adapter_kwargs) + + # Test + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session_class.return_value.__enter__.return_value = mock_requests_session + + result = launch_server_process(server_args, first_rank_in_node=True) + + # Assertions + assert result == mock_multiprocessing_process + mock_multiprocessing_process.start.assert_called_once() + assert mock_requests_session.get.call_count >= 2 # health_generate and flush_cache + + def test_launch_server_process_non_master(self, mock_multiprocessing_process, non_master_adapter_kwargs): + """Test server launch for non-master nodes (should return immediately).""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**non_master_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + result = launch_server_process(server_args, first_rank_in_node=True) + + assert result == mock_multiprocessing_process + mock_multiprocessing_process.start.assert_not_called() + + def test_launch_server_process_timeout(self, mock_multiprocessing_process, real_adapter_kwargs): + """Test timeout during server health check.""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**real_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session = Mock() + mock_session.get.side_effect = 
requests.RequestException("Connection failed") + mock_session_class.return_value.__enter__.return_value = mock_session + + import itertools + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.time.time", + side_effect=itertools.chain([0], itertools.repeat(400)), # 第一次返回0,之后一直返回400 + ): + with pytest.raises(TimeoutError): + launch_server_process(server_args, first_rank_in_node=True) + + mock_multiprocessing_process.terminate.assert_called_once() + + def test_launch_server_process_died(self, real_adapter_kwargs): + """Test server process dies during startup.""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**real_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process = Mock() + mock_process.is_alive.return_value = False + mock_process_class.return_value = mock_process + + with pytest.raises(RuntimeError, match="Server process terminated unexpectedly"): + launch_server_process(server_args, first_rank_in_node=True) + + +class TestHttpServerEngineAdapter: + """Test cases for HttpServerEngineAdapter class.""" + + def test_init_with_router_registration(self, mock_launch_server_process, mock_requests_post, router_adapter_kwargs): + """Test initialization with router registration.""" + adapter = HttpServerAdapter(**router_adapter_kwargs) + + assert adapter.router_ip == "192.168.1.1" + assert adapter.router_port == 8080 + assert adapter.process == mock_launch_server_process.return_value + mock_requests_post.assert_called_once() + + def test_init_without_router(self, mock_launch_server_process, basic_adapter_kwargs): + """Test initialization without router registration.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + assert adapter.router_ip is None + assert adapter.router_port is None + assert adapter.process == mock_launch_server_process.return_value + + def test_register_with_router_failure(self, 
mock_launch_server_process, router_adapter_kwargs): + """Test router registration failure handling.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_post.side_effect = requests.RequestException("Connection failed") + + # Should not raise exception, just log error + adapter = HttpServerAdapter(**router_adapter_kwargs) + + assert adapter.router_ip == "192.168.1.1" + mock_post.assert_called_once() + + def test_make_request_success(self, mock_launch_server_process, basic_adapter_kwargs): + """Test successful HTTP request.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_post.return_value = mock_response + + result = adapter._make_request("test_endpoint", {"param": "value"}) + + assert result == {"status": "success"} + mock_post.assert_called_with( + "http://localhost:8000/test_endpoint", + json={"param": "value"}, + timeout=adapter.timeout, + ) + + def test_make_request_get_method(self, mock_launch_server_process, basic_adapter_kwargs): + """Test HTTP GET request.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": "test"} + mock_get.return_value = mock_response + + result = adapter._make_request("test_endpoint", method="GET") + + assert result == {"data": "test"} + mock_get.assert_called_with("http://localhost:8000/test_endpoint", timeout=adapter.timeout) + + def test_make_request_non_master(self, mock_launch_server_process): + """Test request from non-master node returns empty dict.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": 
"/tmp/test_model"} + adapter = HttpServerAdapter(**kwargs) + result = adapter._make_request("test_endpoint") + + assert result == {} + + def test_make_request_retry_logic(self, mock_launch_server_process, basic_adapter_kwargs): + """Test retry logic for failed requests.""" + adapter = HttpServerAdapter(max_attempts=3, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + with patch("time.sleep") as mock_sleep: + # First two calls fail, third succeeds + mock_post.side_effect = [ + requests.exceptions.Timeout(), + requests.exceptions.ConnectionError(), + Mock(status_code=200, json=lambda: {"success": True}), + ] + + result = adapter._make_request("test_endpoint") + + assert result == {"success": True} + assert mock_post.call_count == 3 + assert mock_sleep.call_count == 2 + + def test_make_request_http_error(self, mock_launch_server_process, basic_adapter_kwargs): + """Test HTTP error handling.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("404 Not Found") + mock_post.return_value = mock_response + + with pytest.raises(requests.exceptions.HTTPError): + adapter._make_request("test_endpoint") + + def test_make_request_max_attempts_exceeded(self, mock_launch_server_process, basic_adapter_kwargs): + """Test max retries exceeded.""" + adapter = HttpServerAdapter(max_attempts=1, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + with patch("time.sleep"): + mock_post.side_effect = requests.exceptions.Timeout() + + with pytest.raises(RuntimeError, match="Failed to complete request"): + adapter._make_request("test_endpoint") + + assert mock_post.call_count == 1 # Initial retry + + def 
test_update_weights_from_tensor_strict(self, mock_launch_server_process, basic_adapter_kwargs): + import base64 + + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=[b"tensor1", b"tensor2"], + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + expected_b64_1 = base64.b64encode(b"tensor1").decode("utf-8") + expected_b64_2 = base64.b64encode(b"tensor2").decode("utf-8") + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [expected_b64_1, expected_b64_2], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def test_update_weights_from_tensor_empty(self, mock_launch_server_process, basic_adapter_kwargs): + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=[], + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def 
test_update_weights_from_tensor_none(self, mock_launch_server_process, basic_adapter_kwargs): + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def test_generate(self, mock_launch_server_process, basic_adapter_kwargs): + """Test generate method.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"text": "Generated text"} + + result = adapter.generate( + prompt="Hello world", + sampling_params={"temperature": 0.7}, + return_logprob=True, + ) + + assert result == {"text": "Generated text"} + mock_request.assert_called_once_with( + "generate", + { + "text": "Hello world", + "sampling_params": {"temperature": 0.7}, + "return_logprob": True, + }, + only_master=False, + ) + + def test_flush_cache(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush_cache method.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + with patch("time.sleep") as mock_sleep: + # First call fails, second succeeds + mock_responses = [ + Mock(status_code=503), # Service unavailable + Mock(status_code=200, json=lambda: {"cache_flushed": True}), + 
] + mock_get.side_effect = mock_responses + + result = adapter.flush_cache() + + assert result == {"cache_flushed": True} + assert mock_get.call_count == 2 + mock_sleep.assert_called_once() + + def test_flush_cache_non_master(self, mock_launch_server_process): + """Test flush_cache for non-master node.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": "/tmp/test_model"} + adapter = HttpServerAdapter(**kwargs) + result = adapter.flush_cache() + + assert result == {} + + def test_memory_management_methods(self, mock_launch_server_process, basic_adapter_kwargs): + """Test memory release and resume methods.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test release_memory_occupation + result = adapter.release_memory_occupation(["weights", "kv_cache"]) + assert result == {"status": "success"} + mock_request.assert_called_with("release_memory_occupation", {"tags": ["weights", "kv_cache"]}) + + # Test resume_memory_occupation + result = adapter.resume_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("resume_memory_occupation", {"tags": ["weights"]}) + + def test_generation_control_methods(self, mock_launch_server_process, basic_adapter_kwargs): + """Test generation control methods.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + def test_shutdown(self, mock_launch_server_process, mock_kill_process_tree, router_adapter_kwargs): + """Test shutdown method.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_post.return_value = mock_response + + adapter = HttpServerAdapter(**router_adapter_kwargs) + + 
adapter.shutdown() + + # Should unregister from router + assert mock_post.call_count == 2 # Once for registration, once for unregistration + # Should kill process + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + def test_shutdown_with_errors(self, mock_launch_server_process, mock_kill_process_tree, router_adapter_kwargs): + """Test shutdown method with errors.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + # Mock registration success but unregistration failure + mock_post.side_effect = [ + Mock(status_code=200), # Registration success + requests.RequestException("Unregistration failed"), # Unregistration failure + ] + + # Mock process kill failure + mock_kill_process_tree.side_effect = Exception("Kill failed") + + adapter = HttpServerAdapter(**router_adapter_kwargs) + + # Should not raise exceptions + adapter.shutdown() + + assert mock_post.call_count == 2 + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + # Edge cases for HttpServerEngineAdapter + def test_empty_and_none_parameters(self, mock_launch_server_process, basic_adapter_kwargs): + """Test handling of empty and None parameters.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format=None, + flush_cache=None, + ) + + # Test generate with all None parameters + result = adapter.generate() + assert result == {"status": "success"} + + # Test with empty lists + result = adapter.update_weights_from_tensor(req) + assert result == {"status": "success"} + + # Test with empty tags + result = adapter.release_memory_occupation(req) + assert result == {"status": "success"} + + def test_large_payload_handling(self, mock_launch_server_process, basic_adapter_kwargs): + 
"""Test handling of large payloads.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test with large tensor list + large_tensor_list = [MultiprocessingSerializer.serialize(f"tensor_{i}") for i in range(1000)] + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=large_tensor_list, + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + assert result == {"status": "success"} + + # Test with large prompt + large_prompt = "A" * 10000 + result = adapter.generate(prompt=large_prompt) + assert result == {"status": "success"} + + def test_timeout_edge_cases(self, mock_launch_server_process): + """Test various timeout scenarios.""" + # Test with very small timeout + kwargs = {"host": "localhost", "port": 8000, "node_rank": 0, "model_path": "/tmp/test_model", "timeout": 0.001} + adapter = HttpServerAdapter(**kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_post.side_effect = requests.exceptions.Timeout() + + with pytest.raises(RuntimeError, match="Failed to complete request"): + adapter._make_request("test_endpoint") + + def test_extreme_configuration_values(self, mock_launch_server_process): + """Test extreme configuration values.""" + # Test with extreme values + kwargs = { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + "timeout": 0.001, # Very small + "max_attempts": 100, # Very large + "retry_delay": 0.001, # Very small + } + adapter = HttpServerAdapter(**kwargs) + + assert adapter.timeout == 0.001 + assert adapter.max_attempts == 100 + assert adapter.retry_delay == 0.001 + + +class TestAsyncHttpServerEngineAdapter: + """Test cases for AsyncHttpServerEngineAdapter class.""" + + def test_init(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async 
adapter initialization.""" + adapter = AsyncHttpServerAdapter(max_connections=50, **basic_adapter_kwargs) + + assert adapter.max_connections == 50 + + @pytest.mark.asyncio + async def test_make_async_request_success(self, mock_launch_server_process, basic_adapter_kwargs): + """Test successful async HTTP request.""" + + # Instantiate adapter + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={"status": "success"}) + mock_response.raise_for_status = Mock() + + mock_post_context_manager = AsyncMock() + mock_post_context_manager.__aenter__.return_value = mock_response + + mock_session = AsyncMock(spec=aiohttp.ClientSession) + mock_session.closed = False + mock_session.post.return_value = mock_post_context_manager + + mock_session_cm = AsyncMock() + mock_session_cm.__aenter__.return_value = mock_session + + with patch.object(adapter, "_get_session", return_value=mock_session_cm): + result = await adapter._make_async_request("test_endpoint", {"param": "value"}) + + # Assert result is correct + assert result == {"status": "success"} + + # Verify post was called + mock_session.post.assert_called_once_with( + "http://localhost:8000/test_endpoint", json={"param": "value"}, timeout=adapter.timeout + ) + + @pytest.mark.asyncio + async def test_make_async_request_get_method(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async GET request using aiohttp and proper context mocking.""" + + # Instantiate the async adapter + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={"data": "test"}) + mock_response.raise_for_status = Mock() + + mock_get_context_manager = AsyncMock() + mock_get_context_manager.__aenter__.return_value = mock_response + + mock_session = AsyncMock(spec=aiohttp.ClientSession) + mock_session.closed = False + 
mock_session.get.return_value = mock_get_context_manager + + mock_session_cm = AsyncMock() + mock_session_cm.__aenter__.return_value = mock_session + + with patch.object(adapter, "_get_session", return_value=mock_session_cm): + result = await adapter._make_async_request("test_endpoint", method="GET") + + # Validate + assert result == {"data": "test"} + mock_session.get.assert_called_once_with("http://localhost:8000/test_endpoint", timeout=adapter.timeout) + + @pytest.mark.asyncio + async def test_make_async_request_non_master(self, mock_launch_server_process): + """Test async request from non-master node.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": "/tmp/test_model"} + adapter = AsyncHttpServerAdapter(**kwargs) + result = await adapter._make_async_request("test_endpoint") + + assert result == {} + + @pytest.mark.asyncio + async def test_async_generate(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async generate method.""" + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_async_request", new_callable=AsyncMock) as mock_request: + mock_request.return_value = {"text": "Generated text"} + + result = await adapter.generate( + prompt="Hello world", + sampling_params={"temperature": 0.7}, + return_logprob=True, + ) + + assert result == {"text": "Generated text"} + mock_request.assert_called_once() + + @pytest.mark.asyncio + async def test_async_memory_management(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async memory management methods.""" + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_async_request", new_callable=AsyncMock) as mock_request: + mock_request.return_value = {"status": "success"} + + # Test release_memory_occupation + result = await adapter.release_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("release_memory_occupation", 
{"tags": ["weights"]}) + + # Test resume_memory_occupation + result = await adapter.resume_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("resume_memory_occupation", {"tags": ["weights"]}) + assert ( + mock_request.call_count == 2 + ) # resume memory occupation will also call release memory occupation once + + +class TestErrorRecovery: + """Test error recovery mechanisms.""" + + def test_flush_cache_recovery(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush cache recovery from failures.""" + adapter = HttpServerAdapter(max_attempts=2, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + # Simulate multiple failures then success + mock_get.side_effect = [ + requests.exceptions.ConnectionError(), + requests.exceptions.Timeout(), + Mock(status_code=503), # Service unavailable + Mock(status_code=200, json=lambda: {"cache_flushed": True}), + ] + + with patch("time.sleep"): + result = adapter.flush_cache() + assert result == {"cache_flushed": True} + + def test_flush_cache_max_attempts(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush cache max retries exceeded.""" + adapter = HttpServerAdapter(max_attempts=1, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + # All attempts fail + mock_get.side_effect = requests.exceptions.ConnectionError() + + with patch("time.sleep"): + result = adapter.flush_cache() + assert result == {} # Should return empty dict on failure + + def test_network_partition_recovery(self, mock_launch_server_process, basic_adapter_kwargs): + """Test recovery from network partition scenarios.""" + adapter = HttpServerAdapter(max_attempts=3, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + # Simulate network partition then recovery 
+ mock_post.side_effect = [ + requests.exceptions.ConnectionError("Network unreachable"), + requests.exceptions.ConnectionError("Network unreachable"), + Mock(status_code=200, json=lambda: {"recovered": True}), + ] + + with patch("time.sleep"): + result = adapter._make_request("test_endpoint") + assert result == {"recovered": True} + + +class TestResourceManagement: + """Test resource management and cleanup.""" + + def test_resource_cleanup_on_exception( + self, mock_launch_server_process, mock_kill_process_tree, basic_adapter_kwargs + ): + """Test resource cleanup when exceptions occur.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Simulate exception during operation + with patch.object(adapter, "_make_request", side_effect=Exception("Test error")): + try: + adapter.generate(prompt="test") + except Exception: + pass + + # Cleanup should still work + adapter.shutdown() + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + def test_multiple_shutdown_calls(self, mock_launch_server_process, basic_adapter_kwargs): + """Test multiple shutdown calls are safe.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Multiple shutdown calls should be safe + adapter.shutdown() + adapter.shutdown() + adapter.shutdown() + + +class TestDataTypeHandling: + """Test handling of various data types.""" + + def test_complex_data_structures(self, mock_launch_server_process, basic_adapter_kwargs): + """Test handling of complex data structures.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test with complex sampling params + complex_sampling_params = { + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "repetition_penalty": 1.1, + "stop_sequences": ["", "\n\n"], + "max_tokens": 100, + "logit_bias": {"token_123": 0.5, "token_456": -0.5}, + "nested_config": { + "beam_search": True, + 
"num_beams": 4, + "early_stopping": True, + }, + } + + result = adapter.generate( + prompt="Test prompt", + sampling_params=complex_sampling_params, + ) + + assert result == {"status": "success"} + # Verify the complex structure was passed through + call_args = mock_request.call_args[0][1] + assert call_args["sampling_params"] == complex_sampling_params + + +class TestIntegration: + """Integration tests for both adapters.""" + + def test_error_scenarios(self, mock_launch_server_process, basic_adapter_kwargs): + """Test various error scenarios.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Test with None payload + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {} + result = adapter.generate() + assert result == {} + + # Test with empty parameters + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {} + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format=None, + flush_cache=None, + ) + result = adapter.update_weights_from_tensor(req) + assert result == {} diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py new file mode 100644 index 0000000000000000000000000000000000000000..b924521705305f9c53d1b7eef0d3d70d017b2df9 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py @@ -0,0 +1,166 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import torch +import torch.distributed as dist +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import ShardedStateDictConfig, ShardingStrategy, StateDictType +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from vllm import SamplingParams + +from verl.third_party.vllm import LLM +from verl.utils.distributed import initialize_global_process_group + + +def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor) -> list[int]: + """Remove left padding tokens before feeding prompts to vLLM.""" + non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0] + return prompt_token_ids[non_pad_index:].tolist() + + +def main(): + assert torch.cuda.is_available(), "CUDA must be present to run FSDP vLLM example" + local_rank, rank, world_size = initialize_global_process_group() + + local_cache_path = "~/.cache/verl/rlhf" + local_cache_path = os.path.expanduser(local_cache_path) + hdfs_path = "Qwen/Qwen2-7B-Instruct" + + from verl.utils.fs import copy_to_local + + local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path) + tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True) + actor_model_config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=True) + with torch.device("cuda"): + actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True) + actor_model.to(torch.bfloat16) 
+ + max_prompt_length = 16 + response_length = 32 + preencode_prompts = [ + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + tokenizer.pad_token = tokenizer.eos_token + prompts = tokenizer(preencode_prompts, return_tensors="pt", padding=True) + input_ids = prompts["input_ids"] + attention_mask = prompts["attention_mask"] + from verl.utils.torch_functional import pad_sequence_to_length + + input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True).cuda() + attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True).cuda() + + from transformers import GenerationConfig + + generation_config = GenerationConfig(do_sample=False) + actor_model.cuda() + output = actor_model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=32, + # max_length=max_length, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + generation_config=generation_config, + # renormalize_logits=True, + output_scores=False, # this is potentially very large + return_dict_in_generate=True, + use_cache=False, + ) # may OOM when use_cache = True + seq = output.sequences + response = seq[:, max_prompt_length:] + + print(f"hf response: {tokenizer.batch_decode(response)}") + + tensor_model_parallel_size = 4 + from torch.distributed.device_mesh import init_device_mesh + + device_mesh = init_device_mesh("cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32) + fsdp_model = FSDP( + actor_model, + use_orig_params=True, + auto_wrap_policy=None, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + cpu_offload=CPUOffload(offload_params=False), + sync_module_states=False, + device_mesh=device_mesh, + ) + + FSDP.set_state_dict_type( + 
fsdp_model, state_dict_type=StateDictType.SHARDED_STATE_DICT, state_dict_config=ShardedStateDictConfig() + ) + + state_dict = fsdp_model.state_dict() + + sampling_params = SamplingParams( + temperature=0, top_p=1, n=1, max_tokens=response_length, logprobs=1, ignore_eos=True, detokenize=False + ) + + print(actor_model_config) + llm = LLM( + model=None, + tokenizer=tokenizer, + model_hf_config=actor_model_config, + tensor_parallel_size=tensor_model_parallel_size, + enforce_eager=True, + dtype="bfloat16", + load_format="dummy_dtensor", + gpu_memory_utilization=0.8, + trust_remote_code=True, + ) + + # Warmup iterations + for _ in range(10): + torch.cuda.synchronize() + llm.sync_model_weights(actor_weights=state_dict, load_format="dtensor") + torch.cuda.synchronize() + dist.barrier() + + start_time = time.time() + llm.sync_model_weights(actor_weights=state_dict, load_format="dtensor") + torch.cuda.synchronize() + dist.barrier() + end_time = time.time() + + # Calculate elapsed time + elapsed_time = end_time - start_time + print(f"Time taken: {elapsed_time:.6f} seconds") + + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + idx_list = [] + batch_size = input_ids.shape[0] + + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id + for i in range(batch_size): + idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i])) + print("start generation") + outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False) + vllm_output = outputs[0].cuda() + if torch.distributed.get_rank() == 0: + print(f"hf response: {tokenizer.batch_decode(response)}") + print(f"vllm response: {tokenizer.batch_decode(vllm_output)}") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py new file mode 100644 index 
0000000000000000000000000000000000000000..82034f1e9059b5c8d91e943e180d73af0f9e7d61 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py @@ -0,0 +1,217 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test vLLM abort functionality. + +Usage: + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s + or + python tests/workers/rollout/rollout_vllm/test_vllm_abort.py +""" + +import asyncio +import os +import time +from uuid import uuid4 + + +def test_vllm_abort(): + # ==================== Configuration ==================== + MODEL_PATH = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") # /root/models/Qwen/Qwen2.5-1.5B-Instruct + GPUS_PER_NODE = 2 + TP_SIZE = 1 + ROLLOUT_NAME = "vllm" + ABORT_DELAY = 0.5 # seconds to wait before aborting + + print("=" * 60) + print("vLLM Abort Test") + print("=" * 60) + print(f"Model: {MODEL_PATH}") + print(f"GPUs: {GPUS_PER_NODE}, TP Size: {TP_SIZE}") + print(f"Abort Delay: {ABORT_DELAY}s") + print("=" * 60) + + # ==================== Initialize Ray ==================== + print("\n[1] Initializing Ray...") + import ray + + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + }, + ignore_reinit_error=True, + ) + + try: + # ==================== Create Config ==================== + print("\n[2] Creating 
config...") + from hydra import compose, initialize_config_dir + + config_dir = os.path.abspath("verl/verl/trainer/config") + if not os.path.exists(config_dir): + config_dir = os.path.abspath("verl/trainer/config") + + with initialize_config_dir(config_dir=config_dir, version_base=None): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = GPUS_PER_NODE + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = MODEL_PATH + config.actor_rollout_ref.rollout.name = ROLLOUT_NAME + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = TP_SIZE + config.actor_rollout_ref.rollout.prompt_length = 512 + config.actor_rollout_ref.rollout.response_length = 512 # Longer for abort test + + # ==================== Create Rollout Server ==================== + print("\n[3] Creating rollout server (this may take a while)...") + from verl.workers.rollout.replica import get_rollout_replica_class + + rollout_config = config.actor_rollout_ref.rollout + model_config = config.actor_rollout_ref.model + + rollout_server_class = get_rollout_replica_class(ROLLOUT_NAME) + server = rollout_server_class( + replica_rank=0, + config=rollout_config, + model_config=model_config, + gpus_per_node=GPUS_PER_NODE, + ) + + asyncio.run(server.init_standalone()) + server_handle = server._server_handle + print(f"Server address: {server._server_address}") + + # ==================== Load Tokenizer ==================== + print("\n[4] Loading tokenizer...") + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + + # ==================== Prepare Prompts ==================== + print("\n[5] Preparing prompts (to ensure generation takes time)...") + NUM_PROMPTS = 8 + prompts = [ + "Write a very long story about a brave knight and dragon.", + "Explain the history of the Roman Empire in great detail.", + "Describe quantum computing and its applications 
thoroughly.", + "Write an essay about climate change and its global effects.", + "Who won the Champions League in 2019?", + "Write a detailed analysis of Shakespeare's Hamlet.", + "Describe the process of photosynthesis in plants.", + "Write about the French Revolution and its consequences.", + ] + + all_prompt_ids = [] + for prompt in prompts[:NUM_PROMPTS]: + messages = [{"role": "user", "content": prompt}] + prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + all_prompt_ids.append(prompt_ids) + print(f"Prepared {NUM_PROMPTS} prompts") + + # ==================== Start Generations and Abort ==================== + print("\n[6] Starting generations and then aborting...") + + sampling_params = { + "temperature": 1.0, + "top_p": 1.0, + "logprobs": False, + } + + # Start all generations concurrently + print(f"\n Starting {NUM_PROMPTS} generations...") + generate_refs = [] + for i, prompt_ids in enumerate(all_prompt_ids): + request_id = f"abort_test_{i}_{uuid4().hex[:8]}" + ref = server_handle.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + image_data=None, + ) + generate_refs.append((i, request_id, ref)) + print(f" Started request {i}: {request_id}") + + # Wait before aborting + print(f"\n Waiting {ABORT_DELAY}s before abort...") + time.sleep(ABORT_DELAY) + + # Call abort + print(" Calling abort_all_requests...") + abort_start = time.perf_counter() + abort_result = ray.get(server_handle.abort_all_requests.remote()) + abort_time = time.perf_counter() - abort_start + + print(f" Abort took: {abort_time * 1000:.2f}ms") + print(f" Abort result: {abort_result}") + + # Wait for all generations to finish + print("\n Waiting for all generations to complete...") + outputs = [] + for i, request_id, ref in generate_refs: + try: + output = ray.get(ref, timeout=10.0) + outputs.append((i, request_id, output)) + except ray.exceptions.GetTimeoutError: + print(f" Request {i} timed out!") 
+ outputs.append((i, request_id, None)) + + # ==================== Print Results ==================== + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60) + + aborted_count = 0 + completed_count = 0 + timeout_count = 0 + + for i, request_id, output in outputs: + if output is None: + timeout_count += 1 + print(f"[{i}] {request_id}: TIMEOUT") + elif output.stop_reason == "aborted": + aborted_count += 1 + print(f"[{i}] {request_id}: ABORTED ({len(output.token_ids)} tokens)") + print(f"Partial Output: {tokenizer.decode(output.token_ids)}") + else: + completed_count += 1 + print(f"[{i}] {request_id}: COMPLETED ({output.stop_reason}, {len(output.token_ids)} tokens)") + print(f"Full Output: {tokenizer.decode(output.token_ids)}") + + print(f"\nSummary: {aborted_count} aborted, {completed_count} completed, {timeout_count} timeout") + + print("\n" + "=" * 60) + print(f"Abort result: {abort_result}") + print("=" * 60) + print("Abort test completed!") + + # Assertions for pytest + assert timeout_count == 0, "No requests should timeout" + assert aborted_count + completed_count == NUM_PROMPTS, "All requests should finish" + assert "aborted_count" in abort_result, "Abort result should contain aborted_count" + assert abort_time < 1.0, "Abort should be fast (< 1 second)" + + finally: + print("\nShutting down Ray...") + ray.shutdown() + + +if __name__ == "__main__": + # Can still run as standalone script + test_vllm_abort() diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb6f4bb2ff3f04a6127304828793151c7b24052 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py @@ -0,0 +1,180 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from omegaconf import OmegaConf +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import ShardedStateDictConfig, ShardingStrategy, StateDictType +from transformers import AutoModelForCausalLM, AutoTokenizer + +from verl import DataProto +from verl.utils.distributed import initialize_global_process_group +from verl.utils.fs import copy_to_local +from verl.utils.model import compute_position_id_with_mask +from verl.workers.rollout.hf_rollout import HFRollout + +BASE_HF_ROLLOUT_CONFIG = { + "temperature": 1.0, + "top_k": -1, + "top_p": 1, + "prompt_length": 64, + "response_length": 64, + "do_sample": True, + "n": 1, + "val_kwargs": { + "top_k": -1, + "top_p": 1.0, + "temperature": 0, + "n": 1, + "do_sample": False, + }, +} + + +def prepare_input_dataproto(tokenizer, config, validate): + preencode_prompts = [ + [{"role": "user", "content": "Who won the Champions League in 2019?"}], + [{"role": "user", "content": "The founder of Apple is"}], + [{"role": "user", "content": "What's your name"}], + ] + formatted_prompts = [ + tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) + for conversation in preencode_prompts + ] + prompts = tokenizer(formatted_prompts, return_tensors="pt", padding="max_length", 
max_length=config.prompt_length) + input_dataproto = DataProto.from_dict( + { + "input_ids": prompts["input_ids"], + "attention_mask": prompts["attention_mask"], + "position_ids": compute_position_id_with_mask(prompts["attention_mask"]), + }, + meta_info={ + "bos_token_id": tokenizer.bos_token_id, + "eos_token_id": tokenizer.eos_token_id, + "pad_token_id": tokenizer.pad_token_id, + "validate": validate, + }, + ) + return input_dataproto + + +def prepare_fsdp_model(model, world_size): + from torch.distributed.device_mesh import init_device_mesh + + device_mesh = init_device_mesh("cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32) + + fsdp_model = FSDP( + model, + use_orig_params=True, + auto_wrap_policy=None, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + cpu_offload=CPUOffload(offload_params=False), + sync_module_states=False, + device_mesh=device_mesh, + ) + + FSDP.set_state_dict_type( + fsdp_model, state_dict_type=StateDictType.SHARDED_STATE_DICT, state_dict_config=ShardedStateDictConfig() + ) + return fsdp_model + + +def test_hf_rollout(n: int = 1, do_sample: bool = True, validate: bool = False): + config = OmegaConf.create(BASE_HF_ROLLOUT_CONFIG) + config.update({"n": n, "do_sample": do_sample}) + + assert torch.cuda.device_count() >= 2, "At least 2 GPUs is required to run tp+dp tests." 
+ local_rank, rank, world_size = initialize_global_process_group() + + # Initialize model and tokenizer + local_cache_path = "~/.cache/verl/rlhf" + local_cache_path = os.path.expanduser(local_cache_path) + hdfs_path = "Qwen/Qwen2-7B-Instruct" + local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path) + tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side="left", trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + # Initialize FSDP model + actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True) + actor_model.to(torch.bfloat16) + fsdp_model = prepare_fsdp_model(actor_model, world_size) + + # Initialize HFRollout and start generate + hf_rollout = HFRollout(fsdp_model, OmegaConf.create(config)) + input = prepare_input_dataproto(tokenizer, config, validate).to(torch.cuda.current_device()) + outputs = hf_rollout.generate_sequences(input) + + # check generated batch size is expected + generated_batch_size = outputs.batch.batch_size[0] + assert generated_batch_size == input.batch.batch_size[0] * config.n + + for i in range(generated_batch_size): + prompt_tokens = outputs.batch["prompts"][i] + prompt_mask = prompt_tokens != tokenizer.pad_token_id + prompt_tokens = prompt_tokens[prompt_mask] + decoded_prompt = tokenizer.decode(prompt_tokens, skip_special_tokens=False) + + response_tokens = outputs.batch["responses"][i] + response_mask = response_tokens != tokenizer.pad_token_id + response_tokens = response_tokens[response_mask] + decoded_response = tokenizer.decode(response_tokens, skip_special_tokens=False) + + attention_mask = outputs.batch["attention_mask"][i] + position_ids = outputs.batch["position_ids"][i] + prompt_length = outputs.batch["prompts"].size(1) + response_length = outputs.batch["responses"].size(1) + + assert attention_mask.size(0) == prompt_length + response_length + assert position_ids.size(0) == prompt_length + response_length + + # check response attention 
mask is expected + response_attention = attention_mask[prompt_length:] + eos_positions = (outputs.batch["responses"][i] == tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if len(eos_positions) > 0: + first_eos_pos = eos_positions[0].item() + assert response_attention[: first_eos_pos + 1].all(), "Response attention mask should be 1 until EOS" + if first_eos_pos + 1 < response_length: + assert not response_attention[first_eos_pos + 1 :].any(), ( + "Response attention mask should be 0 after EOS" + ) + else: + assert response_attention.all(), "Response attention mask should be all 1 if no EOS token" + + # check response position ids is expected + prompt_positions = position_ids[:prompt_length] + response_positions = position_ids[prompt_length:] + valid_response_length = min(len(response_tokens), response_length) + if valid_response_length > 0: + assert response_positions[0] == prompt_positions[-1] + 1 + for j in range(1, valid_response_length): + assert response_positions[j] == response_positions[j - 1] + 1 + + # print generated text for inspection + if torch.distributed.get_rank() == 0: + print(f"prompt: {decoded_prompt}") + print(f"response: {decoded_response}") + print("=" * 30) + + +if __name__ == "__main__": + test_hf_rollout(n=2, do_sample=True, validate=False) + # test_hf_rollout(n=1, do_sample=False, validate=True) + # test_hf_rollout(n=1, do_sample=True, validate=False) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py new file mode 100644 index 0000000000000000000000000000000000000000..dea1b14eaf6bf13e09f4653ff02a0b7208160794 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py @@ -0,0 +1,194 @@ +# Copyright 2025 Amazon.com, Inc. 
or its affiliates +# Copyright 2023-2024 SGLang Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pytest + +from verl.tools.schemas import ToolResponse +from verl.utils.dataset.vision_utils import process_image +from verl.utils.tokenizer import hf_processor +from verl.workers.rollout.schemas import ( + AsyncRolloutRequest, + AsyncRolloutRequestStateEnum, + TokenizationSanityCheckModeEnum, +) + + +def _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=False): + assert len(image_list) == len(description_list) + # Get the smallest dimensions across all images + processed_images = [] + for img_url in image_list: + img = process_image(img_url) + processed_images.append(img) + + min_width = min(img.size[0] for img in processed_images) + min_height = min(img.size[1] for img in processed_images) + min_size = (min_width, min_height) + + if resize_image: + processed_images_resized = [] + for img in processed_images: + img = img.resize(min_size) + processed_images_resized.append(img) + processed_images = processed_images_resized + + # Initial message history + system_prompt = ( + "You will be provided with an image. 
Describe this image and then generate a new image for the next round" + ) + messages = [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Here is the first image provided: "}, + {"type": "image", "image": [processed_images[0]]}, + ], + }, + ] + + # Initial multi_modal_data with one image + multi_modal_data = {"image": [processed_images[0]], "video": []} + # Minimal required fields for AsyncRolloutRequest + + req = AsyncRolloutRequest( + batch_data_id=0, + request_id="test-req-1", + state=AsyncRolloutRequestStateEnum.PENDING, + messages=messages, + multi_modal_keys=["image", "video"], + multi_modal_data=multi_modal_data.copy(), + tool_schemas=[], + tools_kwargs={}, + interaction_kwargs={}, + input_ids=None, + prompt_ids=None, + response_ids=None, + attention_mask=None, + prompt_attention_mask=None, + response_attention_mask=None, + position_ids=None, + prompt_position_ids=None, + response_position_ids=None, + loss_mask=None, + prompt_loss_mask=None, + response_loss_mask=None, + reward_scores={}, + max_prompt_len=8192, + max_response_len=8192, + max_model_len=16384, + metrics={}, + use_inference_chat_template=True, + tokenization_sanity_check_mode=TokenizationSanityCheckModeEnum.STRICT, + generation_prompt_ids=None, + base_conv_wo_gen_prompt_end_pos=0, + base_conv_with_gen_prompt_end_pos=0, + processing_class=processor, + ) + + prev_generated_len = 0 + # Add First Assistant Message and first tool response message(image) + for idx, img in enumerate(processed_images): + if idx == 0: + continue + _ = req.get_generation_prompt_ids(processor) + req.add_assistant_message(processor, content=description_list[idx - 1]) + before_tool_call_len = req.input_ids.shape[-1] + req.add_tool_response_messages( + processor, [ToolResponse(image=[img], text="Here is the new image you requested: ")] + ) + after_tool_call_len = req.input_ids.shape[-1] + if prev_generated_len == 0: + prev_generated_len = 
after_tool_call_len - before_tool_call_len + else: + if resize_image: + assert after_tool_call_len - before_tool_call_len == prev_generated_len + assert req.multi_modal_data["image"] == processed_images[: idx + 1] + + _ = req.get_generation_prompt_ids(processor) + req.add_assistant_message(processor, content=description_list[-1]) + + messages = [msg.model_dump() for msg in req.messages] + tools = [tool.model_dump() for tool in req.tool_schemas] if req.tool_schemas else None + full_prompt_info = req._handle_apply_chat_template( + processor, + messages, + multi_modal_data=req.multi_modal_data, + tools=tools, + add_generation_prompt=False, + tokenize=True, + return_dict=True, + ) + full_prompt_ids = full_prompt_info["input_ids"] + assert full_prompt_ids.eq(req.input_ids).all() + + # We must use dict(full_prompt_info) to convert BatchFeature values to a new dict + # because np.array() only keeps the keys for BatchFeature. + full_prompt_multi_modal_inputs = full_prompt_info.copy() + full_prompt_multi_modal_inputs.pop("input_ids", None) + full_prompt_multi_modal_inputs.pop("attention_mask", None) + + for key in full_prompt_multi_modal_inputs: + assert full_prompt_multi_modal_inputs[key].eq(req.multi_modal_inputs[key]).all() + + +@pytest.mark.skipif( + hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) is None, + reason="Processor not available for Qwen/Qwen2.5-VL-B-Instruct", +) +def test_add_tool_response_messages_image_delta(): + processor = hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) + + # From Qwen2.5-VL-3B-Instruct HF example + img_1_url = {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"} + img_1_description = "A woman sits on the beach at sunset, smiling as she shares a high five with her large dog." 
+ # GitHub Logo + img_2_url = {"image": "https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png"} + img_2_description = "A GitHub Logo image" + # Octocat + img_3_url = {"image": "https://octodex.github.com/images/orderedlistocat.png"} + img_3_description = "An Octocat image" + + image_list = [img_1_url, img_2_url, img_3_url] + description_list = [img_1_description, img_2_description, img_3_description] + _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=False) + + +@pytest.mark.skipif( + hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) is None, + reason="Processor not available for Qwen/Qwen2.5-VL-B-Instruct", +) +def test_add_tool_response_messages_image_delta_resize_image(): + processor = hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) + + # From Qwen2.5-VL-3B-Instruct HF example + img_1_url = {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"} + img_1_description = "A woman sits on the beach at sunset, smiling as she shares a high five with her large dog." 
+ # GitHub Logo + img_2_url = {"image": "https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png"} + img_2_description = "A GitHub Logo image" + # Octocat + img_3_url = {"image": "https://octodex.github.com/images/orderedlistocat.png"} + img_3_description = "An Octocat image" + + image_list = [img_1_url, img_2_url, img_3_url] + description_list = [img_1_description, img_2_description, img_3_description] + _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=True) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..0d3c7b5da2bea7c5ba757ba2b42cc30f58890eb7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py @@ -0,0 +1,57 @@ +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch + +from verl.workers.rollout.sglang_rollout.utils import get_named_tensor_buckets + +_TENSOR_1MB = torch.zeros(512, 512) +_BYTES_1MB = 1 << 20 + + +@pytest.mark.parametrize( + "named_tensors, bucket_size_mb, gt_groups", + [ + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 0.5 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 1 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 1.5 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 2 * _BYTES_1MB, + [["a", "b"]], + ), + ], +) +def test_get_named_tensor_buckets(named_tensors, bucket_size_mb, gt_groups: list[list[str]]): + named_tensors_iter = iter(named_tensors) + groups = list(get_named_tensor_buckets(named_tensors_iter, bucket_size_mb)) + assert len(groups) == len(gt_groups) + for group, gt_group in zip(groups, gt_groups, strict=True): + assert len(group) == len(gt_group) + for (name, _), (gt_name) in zip(group, gt_group, strict=True): + assert name == gt_name diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1db46ab48359087e9979d6efd6ce787913b3e5d4 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py @@ -0,0 +1,133 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import pytest + +from verl.workers.rollout.vllm_rollout.utils import build_cli_args_from_config + + +class TestBuildCliArgsFromConfig: + """Tests for CLI argument serialization from config dictionaries.""" + + def test_string_value(self): + """String values become '--key value'.""" + config = {"model": "gpt2"} + result = build_cli_args_from_config(config) + assert result == ["--model", "gpt2"] + + def test_integer_value(self): + """Integer values are converted to strings.""" + config = {"tensor-parallel-size": 4} + result = build_cli_args_from_config(config) + assert result == ["--tensor-parallel-size", "4"] + + def test_float_value(self): + """Float values are converted to strings.""" + config = {"temperature": 0.7} + result = build_cli_args_from_config(config) + assert result == ["--temperature", "0.7"] + + def test_bool_true(self): + """Bool True adds flag without value.""" + config = {"enable-prefix-caching": True} + result = build_cli_args_from_config(config) + assert result == ["--enable-prefix-caching"] + + def test_bool_false(self): + """Bool False is skipped entirely.""" + config = {"enable-prefix-caching": False} + result = build_cli_args_from_config(config) + assert result == [] + + def test_none_value(self): + """None values are skipped.""" + config = {"lora-path": None} + result = build_cli_args_from_config(config) + assert result == [] + + def test_list_values(self): + """List values are expanded into multiple arguments.""" + config = {"cudagraph-capture-sizes": [1, 2, 4, 8]} + result = build_cli_args_from_config(config) + assert result == ["--cudagraph-capture-sizes", "1", "2", "4", "8"] + + def test_empty_list(self): + """Empty lists are skipped (vLLM nargs='+' requires at least one value).""" + config = {"cudagraph-capture-sizes": []} + result = build_cli_args_from_config(config) + assert result == [] + + def 
test_list_with_strings(self): + """List of strings is properly expanded.""" + config = {"allowed-origins": ["http://localhost", "http://example.com"]} + result = build_cli_args_from_config(config) + assert result == ["--allowed-origins", "http://localhost", "http://example.com"] + + def test_dict_value(self): + """Dict values are JSON serialized.""" + config = {"extra-config": {"key": "value", "nested": True}} + result = build_cli_args_from_config(config) + assert result[0] == "--extra-config" + # JSON output may have different key ordering, so parse and compare + assert json.loads(result[1]) == {"key": "value", "nested": True} + + def test_mixed_config(self): + """Test a realistic mixed configuration.""" + config = { + "tensor-parallel-size": 4, + "enable-prefix-caching": True, + "disable-log-requests": False, + "lora-path": None, + "cudagraph-capture-sizes": [1, 2, 4, 8], + "max-model-len": 2048, + } + result = build_cli_args_from_config(config) + + # Check expected args are present + assert "--tensor-parallel-size" in result + assert "4" in result + assert "--enable-prefix-caching" in result + assert "--cudagraph-capture-sizes" in result + assert "1" in result + assert "8" in result + assert "--max-model-len" in result + assert "2048" in result + + # Check skipped values are not present + assert "--disable-log-requests" not in result + assert "--lora-path" not in result + + def test_preserves_order(self): + """Arguments should preserve dictionary order (Python 3.7+).""" + config = {"first": "a", "second": "b", "third": "c"} + result = build_cli_args_from_config(config) + assert result == ["--first", "a", "--second", "b", "--third", "c"] + + def test_empty_config(self): + """Empty config returns empty list.""" + config = {} + result = build_cli_args_from_config(config) + assert result == [] + + def test_single_element_list(self): + """Single element list works correctly.""" + config = {"sizes": [42]} + result = build_cli_args_from_config(config) + assert result 
== ["--sizes", "42"] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])