diff --git "a/code/RL_model/verl/verl_train/.log" "b/code/RL_model/verl/verl_train/.log"
new file mode 100644--- /dev/null
+++ "b/code/RL_model/verl/verl_train/.log"
@@ -0,0 +1,1123 @@
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+  import pynvml  # type: ignore[import]
+INFO 02-07 12:56:22 [__init__.py:216] Automatically detected platform cuda.
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/backends.py:21: UserWarning: Apex is not installed. Falling back to Torch Norm
+  warnings.warn("Apex is not installed. Falling back to Torch Norm")
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+  warnings.warn(
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+  warnings.warn(
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+  warnings.warn(
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm
+  warnings.warn("Apex is not installed. Falling back to Torch Norm")
+ray init kwargs: {'num_cpus': None, 'runtime_env': {'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN', 'VLLM_LOGGING_LEVEL': 'WARN', 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': 'true', 'CUDA_DEVICE_MAX_CONNECTIONS': '1', 'NCCL_CUMEM_ENABLE': '0', 'VLLM_DISABLE_COMPILE_CACHE': '1', 'HCCL_HOST_SOCKET_PORT_RANGE': 'auto', 'HCCL_NPU_SOCKET_PORT_RANGE': 'auto'}, 'working_dir': None}}
+2026-02-07 12:56:39,494	INFO worker.py:1998 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8301 [39m[22m
+/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/worker.py:2046: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0
+  warnings.warn(
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=896026)[0m   import pynvml  # type: ignore[import]
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/backends.py:21: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(pid=896026)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+[36m(pid=896026)[0m   warnings.warn(
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+[36m(pid=896026)[0m   warnings.warn(
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+[36m(pid=896026)[0m   warnings.warn(
+[36m(pid=896026)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(pid=896026)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(TaskRunner pid=896026)[0m TaskRunner hostname: gamma, PID: 896026
+[36m(TaskRunner pid=896026)[0m {'actor_rollout_ref': {'actor': {'_target_': 'verl.workers.config.FSDPActorConfig',
+[36m(TaskRunner pid=896026)[0m                                  'calculate_entropy': False,
+[36m(TaskRunner pid=896026)[0m                                  'calculate_sum_pi_squared': False,
+[36m(TaskRunner pid=896026)[0m                                  'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig',
+[36m(TaskRunner pid=896026)[0m                                                 'async_save': False,
+[36m(TaskRunner pid=896026)[0m                                                 'load_contents': ['model',
+[36m(TaskRunner pid=896026)[0m                                                                   'optimizer',
+[36m(TaskRunner pid=896026)[0m                                                                   'extra'],
+[36m(TaskRunner pid=896026)[0m                                                 'save_contents': ['model',
+[36m(TaskRunner pid=896026)[0m                                                                   'optimizer',
+[36m(TaskRunner pid=896026)[0m                                                                   'extra']},
+[36m(TaskRunner pid=896026)[0m                                  'clip_ratio': 0.2,
+[36m(TaskRunner pid=896026)[0m                                  'clip_ratio_c': 3.0,
+[36m(TaskRunner pid=896026)[0m                                  'clip_ratio_high': 0.2,
+[36m(TaskRunner pid=896026)[0m                                  'clip_ratio_low': 0.2,
+[36m(TaskRunner pid=896026)[0m                                  'data_loader_seed': 42,
+[36m(TaskRunner pid=896026)[0m                                  'entropy_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                  'entropy_coeff': 0,
+[36m(TaskRunner pid=896026)[0m                                  'entropy_from_logits_with_chunking': False,
+[36m(TaskRunner pid=896026)[0m                                  'freeze_vision_tower': False,
+[36m(TaskRunner pid=896026)[0m                                  'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig',
+[36m(TaskRunner pid=896026)[0m                                                  'dtype': 'bfloat16',
+[36m(TaskRunner pid=896026)[0m                                                  'entropy_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                                  'entropy_from_logits_with_chunking': False,
+[36m(TaskRunner pid=896026)[0m                                                  'forward_only': False,
+[36m(TaskRunner pid=896026)[0m                                                  'forward_prefetch': False,
+[36m(TaskRunner pid=896026)[0m                                                  'fsdp_size': -1,
+[36m(TaskRunner pid=896026)[0m                                                  'full_determinism': False,
+[36m(TaskRunner pid=896026)[0m                                                  'model_dtype': 'fp32',
+[36m(TaskRunner pid=896026)[0m                                                  'offload_policy': False,
+[36m(TaskRunner pid=896026)[0m                                                  'optimizer_offload': False,
+[36m(TaskRunner pid=896026)[0m                                                  'param_offload': False,
+[36m(TaskRunner pid=896026)[0m                                                  'reshard_after_forward': True,
+[36m(TaskRunner pid=896026)[0m                                                  'seed': 42,
+[36m(TaskRunner pid=896026)[0m                                                  'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                                                  'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                                  'use_orig_params': False,
+[36m(TaskRunner pid=896026)[0m                                                  'use_torch_compile': True,
+[36m(TaskRunner pid=896026)[0m                                                  'wrap_policy': {'min_num_params': 0}},
+[36m(TaskRunner pid=896026)[0m                                  'grad_clip': 1.0,
+[36m(TaskRunner pid=896026)[0m                                  'kl_loss_coef': 0.001,
+[36m(TaskRunner pid=896026)[0m                                  'kl_loss_type': 'low_var_kl',
+[36m(TaskRunner pid=896026)[0m                                  'loss_agg_mode': 'token-mean',
+[36m(TaskRunner pid=896026)[0m                                  'loss_scale_factor': None,
+[36m(TaskRunner pid=896026)[0m                                  'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig',
+[36m(TaskRunner pid=896026)[0m                                            'betas': [0.9, 0.999],
+[36m(TaskRunner pid=896026)[0m                                            'clip_grad': 1.0,
+[36m(TaskRunner pid=896026)[0m                                            'lr': 1e-06,
+[36m(TaskRunner pid=896026)[0m                                            'lr_scheduler_type': 'constant',
+[36m(TaskRunner pid=896026)[0m                                            'lr_warmup_steps': -1,
+[36m(TaskRunner pid=896026)[0m                                            'lr_warmup_steps_ratio': 0.0,
+[36m(TaskRunner pid=896026)[0m                                            'min_lr_ratio': 0.0,
+[36m(TaskRunner pid=896026)[0m                                            'num_cycles': 0.5,
+[36m(TaskRunner pid=896026)[0m                                            'optimizer': 'AdamW',
+[36m(TaskRunner pid=896026)[0m                                            'optimizer_impl': 'torch.optim',
+[36m(TaskRunner pid=896026)[0m                                            'override_optimizer_config': None,
+[36m(TaskRunner pid=896026)[0m                                            'total_training_steps': -1,
+[36m(TaskRunner pid=896026)[0m                                            'warmup_style': None,
+[36m(TaskRunner pid=896026)[0m                                            'weight_decay': 0.01},
+[36m(TaskRunner pid=896026)[0m                                  'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig',
+[36m(TaskRunner pid=896026)[0m                                                  'clip_cov_lb': 1.0,
+[36m(TaskRunner pid=896026)[0m                                                  'clip_cov_ratio': 0.0002,
+[36m(TaskRunner pid=896026)[0m                                                  'clip_cov_ub': 5.0,
+[36m(TaskRunner pid=896026)[0m                                                  'kl_cov_ratio': 0.0002,
+[36m(TaskRunner pid=896026)[0m                                                  'loss_mode': 'vanilla',
+[36m(TaskRunner pid=896026)[0m                                                  'ppo_kl_coef': 0.1},
+[36m(TaskRunner pid=896026)[0m                                  'ppo_epochs': 1,
+[36m(TaskRunner pid=896026)[0m                                  'ppo_max_token_len_per_gpu': 16384,
+[36m(TaskRunner pid=896026)[0m                                  'ppo_micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m                                  'ppo_micro_batch_size_per_gpu': 2,
+[36m(TaskRunner pid=896026)[0m                                  'ppo_mini_batch_size': 4,
+[36m(TaskRunner pid=896026)[0m                                  'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                                               'all_ranks': False,
+[36m(TaskRunner pid=896026)[0m                                               'enable': False,
+[36m(TaskRunner pid=896026)[0m                                               'ranks': [],
+[36m(TaskRunner pid=896026)[0m                                               'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                                               'tool': None,
+[36m(TaskRunner pid=896026)[0m                                               'tool_config': {'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                       'analysis': True,
+[36m(TaskRunner pid=896026)[0m                                                                       'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                       'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                                       'level': 'level0'},
+[36m(TaskRunner pid=896026)[0m                                                               'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                        'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                               'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                         'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                         'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                               'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                                'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                                                'trace_alloc_max_entries': 100000}}},
+[36m(TaskRunner pid=896026)[0m                                  'rollout_n': 3,
+[36m(TaskRunner pid=896026)[0m                                  'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig',
+[36m(TaskRunner pid=896026)[0m                                                    'mode': 'disabled',
+[36m(TaskRunner pid=896026)[0m                                                    'record_file': None,
+[36m(TaskRunner pid=896026)[0m                                                    'replay_file': None},
+[36m(TaskRunner pid=896026)[0m                                  'shuffle': False,
+[36m(TaskRunner pid=896026)[0m                                  'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                                  'sum_pi_squared_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                  'tau_neg': 1.05,
+[36m(TaskRunner pid=896026)[0m                                  'tau_pos': 1.0,
+[36m(TaskRunner pid=896026)[0m                                  'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                  'use_dynamic_bsz': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_fused_kernels': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_kl_loss': True,
+[36m(TaskRunner pid=896026)[0m                                  'use_prefix_grouper': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_remove_padding': True,
+[36m(TaskRunner pid=896026)[0m                                  'use_torch_compile': True},
+[36m(TaskRunner pid=896026)[0m                        'hybrid_engine': True,
+[36m(TaskRunner pid=896026)[0m                        'model': {'_target_': 'verl.workers.config.HFModelConfig',
+[36m(TaskRunner pid=896026)[0m                                  'custom_chat_template': None,
+[36m(TaskRunner pid=896026)[0m                                  'enable_activation_offload': False,
+[36m(TaskRunner pid=896026)[0m                                  'enable_gradient_checkpointing': True,
+[36m(TaskRunner pid=896026)[0m                                  'exclude_modules': None,
+[36m(TaskRunner pid=896026)[0m                                  'external_lib': None,
+[36m(TaskRunner pid=896026)[0m                                  'fused_kernel_options': {'impl_backend': 'torch'},
+[36m(TaskRunner pid=896026)[0m                                  'hf_config_path': None,
+[36m(TaskRunner pid=896026)[0m                                  'lora_adapter_path': None,
+[36m(TaskRunner pid=896026)[0m                                  'lora_alpha': 16,
+[36m(TaskRunner pid=896026)[0m                                  'lora_rank': 0,
+[36m(TaskRunner pid=896026)[0m                                  'mtp': {'_target_': 'verl.workers.config.MtpConfig',
+[36m(TaskRunner pid=896026)[0m                                          'detach_encoder': False,
+[36m(TaskRunner pid=896026)[0m                                          'enable': False,
+[36m(TaskRunner pid=896026)[0m                                          'enable_rollout': False,
+[36m(TaskRunner pid=896026)[0m                                          'enable_train': False,
+[36m(TaskRunner pid=896026)[0m                                          'method': 'mtp',
+[36m(TaskRunner pid=896026)[0m                                          'mtp_loss_scaling_factor': 0.1,
+[36m(TaskRunner pid=896026)[0m                                          'num_speculative_tokens': 1,
+[36m(TaskRunner pid=896026)[0m                                          'speculative_algorithm': 'EAGLE',
+[36m(TaskRunner pid=896026)[0m                                          'speculative_eagle_topk': 1,
+[36m(TaskRunner pid=896026)[0m                                          'speculative_num_draft_tokens': 4,
+[36m(TaskRunner pid=896026)[0m                                          'speculative_num_steps': 3},
+[36m(TaskRunner pid=896026)[0m                                  'override_config': {},
+[36m(TaskRunner pid=896026)[0m                                  'path': 'Qwen/Qwen3-4B-Instruct-2507',
+[36m(TaskRunner pid=896026)[0m                                  'target_modules': 'all-linear',
+[36m(TaskRunner pid=896026)[0m                                  'tiled_mlp': {'enabled': False,
+[36m(TaskRunner pid=896026)[0m                                                'num_shards': 4},
+[36m(TaskRunner pid=896026)[0m                                  'tokenizer_path': None,
+[36m(TaskRunner pid=896026)[0m                                  'trust_remote_code': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_fused_kernels': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_liger': False,
+[36m(TaskRunner pid=896026)[0m                                  'use_remove_padding': True,
+[36m(TaskRunner pid=896026)[0m                                  'use_shm': False},
+[36m(TaskRunner pid=896026)[0m                        'nccl_timeout': 600,
+[36m(TaskRunner pid=896026)[0m                        'ref': {'_target_': 'verl.workers.config.FSDPActorConfig',
+[36m(TaskRunner pid=896026)[0m                                'entropy_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                'entropy_from_logits_with_chunking': False,
+[36m(TaskRunner pid=896026)[0m                                'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig',
+[36m(TaskRunner pid=896026)[0m                                                'dtype': 'bfloat16',
+[36m(TaskRunner pid=896026)[0m                                                'entropy_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                                'entropy_from_logits_with_chunking': False,
+[36m(TaskRunner pid=896026)[0m                                                'forward_only': True,
+[36m(TaskRunner pid=896026)[0m                                                'forward_prefetch': False,
+[36m(TaskRunner pid=896026)[0m                                                'fsdp_size': -1,
+[36m(TaskRunner pid=896026)[0m                                                'full_determinism': False,
+[36m(TaskRunner pid=896026)[0m                                                'model_dtype': 'fp32',
+[36m(TaskRunner pid=896026)[0m                                                'offload_policy': False,
+[36m(TaskRunner pid=896026)[0m                                                'optimizer_offload': False,
+[36m(TaskRunner pid=896026)[0m                                                'param_offload': False,
+[36m(TaskRunner pid=896026)[0m                                                'reshard_after_forward': True,
+[36m(TaskRunner pid=896026)[0m                                                'seed': 42,
+[36m(TaskRunner pid=896026)[0m                                                'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                                                'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                                'use_orig_params': False,
+[36m(TaskRunner pid=896026)[0m                                                'use_torch_compile': True,
+[36m(TaskRunner pid=896026)[0m                                                'wrap_policy': {'min_num_params': 0}},
+[36m(TaskRunner pid=896026)[0m                                'log_prob_max_token_len_per_gpu': 16384,
+[36m(TaskRunner pid=896026)[0m                                'log_prob_micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m                                'log_prob_micro_batch_size_per_gpu': 32,
+[36m(TaskRunner pid=896026)[0m                                'log_prob_use_dynamic_bsz': False,
+[36m(TaskRunner pid=896026)[0m                                'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                                             'all_ranks': False,
+[36m(TaskRunner pid=896026)[0m                                             'enable': False,
+[36m(TaskRunner pid=896026)[0m                                             'ranks': [],
+[36m(TaskRunner pid=896026)[0m                                             'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                                             'tool': None,
+[36m(TaskRunner pid=896026)[0m                                             'tool_config': {'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                     'analysis': True,
+[36m(TaskRunner pid=896026)[0m                                                                     'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                     'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                                     'level': 'level0'},
+[36m(TaskRunner pid=896026)[0m                                                             'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                      'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                             'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                       'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                       'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                             'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                              'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                                              'trace_alloc_max_entries': 100000}}},
+[36m(TaskRunner pid=896026)[0m                                'rollout_n': 3,
+[36m(TaskRunner pid=896026)[0m                                'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig',
+[36m(TaskRunner pid=896026)[0m                                                  'mode': 'disabled',
+[36m(TaskRunner pid=896026)[0m                                                  'record_file': None,
+[36m(TaskRunner pid=896026)[0m                                                  'replay_file': None},
+[36m(TaskRunner pid=896026)[0m                                'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                                'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                'use_torch_compile': True},
+[36m(TaskRunner pid=896026)[0m                        'rollout': {'_target_': 'verl.workers.config.RolloutConfig',
+[36m(TaskRunner pid=896026)[0m                                    'agent': {'_target_': 'verl.workers.config.AgentLoopConfig',
+[36m(TaskRunner pid=896026)[0m                                              'agent_loop_config_path': None,
+[36m(TaskRunner pid=896026)[0m                                              'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig',
+[36m(TaskRunner pid=896026)[0m                                                                      'name': None,
+[36m(TaskRunner pid=896026)[0m                                                                      'path': None},
+[36m(TaskRunner pid=896026)[0m                                              'default_agent_loop': 'single_turn_agent',
+[36m(TaskRunner pid=896026)[0m                                              'num_workers': 8},
+[36m(TaskRunner pid=896026)[0m                                    'calculate_log_probs': False,
+[36m(TaskRunner pid=896026)[0m                                    'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig',
+[36m(TaskRunner pid=896026)[0m                                                          'backend': 'naive',
+[36m(TaskRunner pid=896026)[0m                                                          'engine_kwargs': {},
+[36m(TaskRunner pid=896026)[0m                                                          'update_weights_bucket_megabytes': 2048},
+[36m(TaskRunner pid=896026)[0m                                    'cudagraph_capture_sizes': None,
+[36m(TaskRunner pid=896026)[0m                                    'data_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                    'disable_log_stats': True,
+[36m(TaskRunner pid=896026)[0m                                    'do_sample': True,
+[36m(TaskRunner pid=896026)[0m                                    'dtype': 'bfloat16',
+[36m(TaskRunner pid=896026)[0m                                    'enable_chunked_prefill': True,
+[36m(TaskRunner pid=896026)[0m                                    'enable_prefix_caching': True,
+[36m(TaskRunner pid=896026)[0m                                    'enable_rollout_routing_replay': False,
+[36m(TaskRunner pid=896026)[0m                                    'enforce_eager': False,
+[36m(TaskRunner pid=896026)[0m                                    'engine_kwargs': {'sglang': {},
+[36m(TaskRunner pid=896026)[0m                                                      'trtllm': {},
+[36m(TaskRunner pid=896026)[0m                                                      'vllm': {}},
+[36m(TaskRunner pid=896026)[0m                                    'expert_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                    'free_cache_engine': True,
+[36m(TaskRunner pid=896026)[0m                                    'gpu_memory_utilization': 0.6,
+[36m(TaskRunner pid=896026)[0m                                    'ignore_eos': False,
+[36m(TaskRunner pid=896026)[0m                                    'layered_summon': False,
+[36m(TaskRunner pid=896026)[0m                                    'load_format': 'dummy',
+[36m(TaskRunner pid=896026)[0m                                    'log_prob_max_token_len_per_gpu': 16384,
+[36m(TaskRunner pid=896026)[0m                                    'log_prob_micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m                                    'log_prob_micro_batch_size_per_gpu': 2,
+[36m(TaskRunner pid=896026)[0m                                    'log_prob_use_dynamic_bsz': False,
+[36m(TaskRunner pid=896026)[0m                                    'logprobs_mode': 'processed_logprobs',
+[36m(TaskRunner pid=896026)[0m                                    'max_model_len': 8192,
+[36m(TaskRunner pid=896026)[0m                                    'max_num_batched_tokens': 8192,
+[36m(TaskRunner pid=896026)[0m                                    'max_num_seqs': 1024,
+[36m(TaskRunner pid=896026)[0m                                    'mode': 'async',
+[36m(TaskRunner pid=896026)[0m                                    'mtp': {'_target_': 'verl.workers.config.MtpConfig',
+[36m(TaskRunner pid=896026)[0m                                            'detach_encoder': False,
+[36m(TaskRunner pid=896026)[0m                                            'enable': False,
+[36m(TaskRunner pid=896026)[0m                                            'enable_rollout': False,
+[36m(TaskRunner pid=896026)[0m                                            'enable_train': False,
+[36m(TaskRunner pid=896026)[0m                                            'method': 'mtp',
+[36m(TaskRunner pid=896026)[0m                                            'mtp_loss_scaling_factor': 0.1,
+[36m(TaskRunner pid=896026)[0m                                            'num_speculative_tokens': 1,
+[36m(TaskRunner pid=896026)[0m                                            'speculative_algorithm': 'EAGLE',
+[36m(TaskRunner pid=896026)[0m                                            'speculative_eagle_topk': 1,
+[36m(TaskRunner pid=896026)[0m                                            'speculative_num_draft_tokens': 4,
+[36m(TaskRunner pid=896026)[0m                                            'speculative_num_steps': 3},
+[36m(TaskRunner pid=896026)[0m                                    'multi_stage_wake_up': False,
+[36m(TaskRunner pid=896026)[0m                                    'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig',
+[36m(TaskRunner pid=896026)[0m                                                   'enable': False,
+[36m(TaskRunner pid=896026)[0m                                                   'format': 'hermes',
+[36m(TaskRunner pid=896026)[0m                                                   'interaction_config_path': None,
+[36m(TaskRunner pid=896026)[0m                                                   'max_assistant_turns': None,
+[36m(TaskRunner pid=896026)[0m                                                   'max_parallel_calls': 1,
+[36m(TaskRunner pid=896026)[0m                                                   'max_tool_response_length': 256,
+[36m(TaskRunner pid=896026)[0m                                                   'max_user_turns': None,
+[36m(TaskRunner pid=896026)[0m                                                   'num_repeat_rollouts': None,
+[36m(TaskRunner pid=896026)[0m                                                   'tokenization_sanity_check_mode': 'strict',
+[36m(TaskRunner pid=896026)[0m                                                   'tool_config_path': None,
+[36m(TaskRunner pid=896026)[0m                                                   'tool_response_truncate_side': 'middle',
+[36m(TaskRunner pid=896026)[0m                                                   'use_inference_chat_template': False},
+[36m(TaskRunner pid=896026)[0m                                    'n': 3,
+[36m(TaskRunner pid=896026)[0m                                    'name': 'vllm',
+[36m(TaskRunner pid=896026)[0m                                    'over_sample_rate': 0,
+[36m(TaskRunner pid=896026)[0m                                    'pipeline_model_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                    'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                                                 'all_ranks': False,
+[36m(TaskRunner pid=896026)[0m                                                 'enable': False,
+[36m(TaskRunner pid=896026)[0m                                                 'ranks': [],
+[36m(TaskRunner pid=896026)[0m                                                 'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                                                 'tool': None,
+[36m(TaskRunner pid=896026)[0m                                                 'tool_config': {'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                         'analysis': True,
+[36m(TaskRunner pid=896026)[0m                                                                         'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                         'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                                         'level': 'level0'},
+[36m(TaskRunner pid=896026)[0m                                                                 'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                          'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                                 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                           'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                                           'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                                 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                                  'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                                                  'trace_alloc_max_entries': 100000}}},
+[36m(TaskRunner pid=896026)[0m                                    'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig',
+[36m(TaskRunner pid=896026)[0m                                                   'enable': False,
+[36m(TaskRunner pid=896026)[0m                                                   'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml',
+[36m(TaskRunner pid=896026)[0m                                                   'port': 9090,
+[36m(TaskRunner pid=896026)[0m                                                   'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'},
+[36m(TaskRunner pid=896026)[0m                                    'prompt_length': 1024,
+[36m(TaskRunner pid=896026)[0m                                    'quantization': None,
+[36m(TaskRunner pid=896026)[0m                                    'quantization_config_file': None,
+[36m(TaskRunner pid=896026)[0m                                    'response_length': 2048,
+[36m(TaskRunner pid=896026)[0m                                    'scheduling_policy': 'fcfs',
+[36m(TaskRunner pid=896026)[0m                                    'skip_dump_dir': '/tmp/rollout_dump',
+[36m(TaskRunner pid=896026)[0m                                    'skip_rollout': False,
+[36m(TaskRunner pid=896026)[0m                                    'skip_tokenizer_init': True,
+[36m(TaskRunner pid=896026)[0m                                    'temperature': 1.0,
+[36m(TaskRunner pid=896026)[0m                                    'tensor_model_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                    'top_k': -1,
+[36m(TaskRunner pid=896026)[0m                                    'top_p': 1,
+[36m(TaskRunner pid=896026)[0m                                    'trace': {'_target_': 'verl.workers.config.TraceConfig',
+[36m(TaskRunner pid=896026)[0m                                              'backend': None,
+[36m(TaskRunner pid=896026)[0m                                              'max_samples_per_step_per_worker': None,
+[36m(TaskRunner pid=896026)[0m                                              'token2text': False},
+[36m(TaskRunner pid=896026)[0m                                    'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig',
+[36m(TaskRunner pid=896026)[0m                                                   'do_sample': False,
+[36m(TaskRunner pid=896026)[0m                                                   'n': 1,
+[36m(TaskRunner pid=896026)[0m                                                   'temperature': 0,
+[36m(TaskRunner pid=896026)[0m                                                   'top_k': -1,
+[36m(TaskRunner pid=896026)[0m                                                   'top_p': 1.0}}},
+[36m(TaskRunner pid=896026)[0m  'algorithm': {'_target_': 'verl.trainer.config.AlgoConfig',
+[36m(TaskRunner pid=896026)[0m                'adv_estimator': 'grpo',
+[36m(TaskRunner pid=896026)[0m                'gamma': 1.0,
+[36m(TaskRunner pid=896026)[0m                'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig',
+[36m(TaskRunner pid=896026)[0m                            'horizon': 10000,
+[36m(TaskRunner pid=896026)[0m                            'kl_coef': 0.001,
+[36m(TaskRunner pid=896026)[0m                            'target_kl': 0.1,
+[36m(TaskRunner pid=896026)[0m                            'type': 'fixed'},
+[36m(TaskRunner pid=896026)[0m                'kl_penalty': 'kl',
+[36m(TaskRunner pid=896026)[0m                'lam': 1.0,
+[36m(TaskRunner pid=896026)[0m                'norm_adv_by_std_in_grpo': True,
+[36m(TaskRunner pid=896026)[0m                'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0},
+[36m(TaskRunner pid=896026)[0m                'rollout_correction': {'bypass_mode': False,
+[36m(TaskRunner pid=896026)[0m                                       'loss_type': 'ppo_clip',
+[36m(TaskRunner pid=896026)[0m                                       'rollout_is': None,
+[36m(TaskRunner pid=896026)[0m                                       'rollout_is_batch_normalize': False,
+[36m(TaskRunner pid=896026)[0m                                       'rollout_is_threshold': 2.0,
+[36m(TaskRunner pid=896026)[0m                                       'rollout_rs': None,
+[36m(TaskRunner pid=896026)[0m                                       'rollout_rs_threshold': None},
+[36m(TaskRunner pid=896026)[0m                'use_kl_in_reward': False,
+[36m(TaskRunner pid=896026)[0m                'use_pf_ppo': False},
+[36m(TaskRunner pid=896026)[0m  'critic': {'_target_': 'verl.workers.config.FSDPCriticConfig',
+[36m(TaskRunner pid=896026)[0m             'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig',
+[36m(TaskRunner pid=896026)[0m                            'async_save': False,
+[36m(TaskRunner pid=896026)[0m                            'load_contents': ['model', 'optimizer', 'extra'],
+[36m(TaskRunner pid=896026)[0m                            'save_contents': ['model', 'optimizer', 'extra']},
+[36m(TaskRunner pid=896026)[0m             'cliprange_value': 0.5,
+[36m(TaskRunner pid=896026)[0m             'data_loader_seed': 42,
+[36m(TaskRunner pid=896026)[0m             'enable': None,
+[36m(TaskRunner pid=896026)[0m             'forward_max_token_len_per_gpu': 32768,
+[36m(TaskRunner pid=896026)[0m             'forward_micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m             'forward_micro_batch_size_per_gpu': None,
+[36m(TaskRunner pid=896026)[0m             'grad_clip': 1.0,
+[36m(TaskRunner pid=896026)[0m             'loss_agg_mode': 'token-mean',
+[36m(TaskRunner pid=896026)[0m             'model': {'_target_': 'verl.workers.config.FSDPCriticModelCfg',
+[36m(TaskRunner pid=896026)[0m                       'enable_activation_offload': False,
+[36m(TaskRunner pid=896026)[0m                       'enable_gradient_checkpointing': True,
+[36m(TaskRunner pid=896026)[0m                       'external_lib': None,
+[36m(TaskRunner pid=896026)[0m                       'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig',
+[36m(TaskRunner pid=896026)[0m                                       'dtype': 'bfloat16',
+[36m(TaskRunner pid=896026)[0m                                       'entropy_checkpointing': False,
+[36m(TaskRunner pid=896026)[0m                                       'entropy_from_logits_with_chunking': False,
+[36m(TaskRunner pid=896026)[0m                                       'forward_only': False,
+[36m(TaskRunner pid=896026)[0m                                       'forward_prefetch': False,
+[36m(TaskRunner pid=896026)[0m                                       'fsdp_size': -1,
+[36m(TaskRunner pid=896026)[0m                                       'full_determinism': False,
+[36m(TaskRunner pid=896026)[0m                                       'model_dtype': 'fp32',
+[36m(TaskRunner pid=896026)[0m                                       'offload_policy': False,
+[36m(TaskRunner pid=896026)[0m                                       'optimizer_offload': False,
+[36m(TaskRunner pid=896026)[0m                                       'param_offload': False,
+[36m(TaskRunner pid=896026)[0m                                       'reshard_after_forward': True,
+[36m(TaskRunner pid=896026)[0m                                       'seed': 42,
+[36m(TaskRunner pid=896026)[0m                                       'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                                       'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                                       'use_orig_params': False,
+[36m(TaskRunner pid=896026)[0m                                       'use_torch_compile': True,
+[36m(TaskRunner pid=896026)[0m                                       'wrap_policy': {'min_num_params': 0}},
+[36m(TaskRunner pid=896026)[0m                       'lora_alpha': 16,
+[36m(TaskRunner pid=896026)[0m                       'lora_rank': 0,
+[36m(TaskRunner pid=896026)[0m                       'override_config': {},
+[36m(TaskRunner pid=896026)[0m                       'path': '~/models/deepseek-llm-7b-chat',
+[36m(TaskRunner pid=896026)[0m                       'target_modules': 'all-linear',
+[36m(TaskRunner pid=896026)[0m                       'tiled_mlp': {'enabled': False, 'num_shards': 4},
+[36m(TaskRunner pid=896026)[0m                       'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507',
+[36m(TaskRunner pid=896026)[0m                       'trust_remote_code': False,
+[36m(TaskRunner pid=896026)[0m                       'use_remove_padding': False,
+[36m(TaskRunner pid=896026)[0m                       'use_shm': False},
+[36m(TaskRunner pid=896026)[0m             'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig',
+[36m(TaskRunner pid=896026)[0m                       'betas': [0.9, 0.999],
+[36m(TaskRunner pid=896026)[0m                       'clip_grad': 1.0,
+[36m(TaskRunner pid=896026)[0m                       'lr': 1e-05,
+[36m(TaskRunner pid=896026)[0m                       'lr_scheduler_type': 'constant',
+[36m(TaskRunner pid=896026)[0m                       'lr_warmup_steps': -1,
+[36m(TaskRunner pid=896026)[0m                       'lr_warmup_steps_ratio': 0.0,
+[36m(TaskRunner pid=896026)[0m                       'min_lr_ratio': 0.0,
+[36m(TaskRunner pid=896026)[0m                       'num_cycles': 0.5,
+[36m(TaskRunner pid=896026)[0m                       'optimizer': 'AdamW',
+[36m(TaskRunner pid=896026)[0m                       'optimizer_impl': 'torch.optim',
+[36m(TaskRunner pid=896026)[0m                       'override_optimizer_config': None,
+[36m(TaskRunner pid=896026)[0m                       'total_training_steps': -1,
+[36m(TaskRunner pid=896026)[0m                       'warmup_style': None,
+[36m(TaskRunner pid=896026)[0m                       'weight_decay': 0.01},
+[36m(TaskRunner pid=896026)[0m             'ppo_epochs': 1,
+[36m(TaskRunner pid=896026)[0m             'ppo_max_token_len_per_gpu': 32768,
+[36m(TaskRunner pid=896026)[0m             'ppo_micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m             'ppo_micro_batch_size_per_gpu': None,
+[36m(TaskRunner pid=896026)[0m             'ppo_mini_batch_size': 4,
+[36m(TaskRunner pid=896026)[0m             'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                          'all_ranks': False,
+[36m(TaskRunner pid=896026)[0m                          'enable': False,
+[36m(TaskRunner pid=896026)[0m                          'ranks': [],
+[36m(TaskRunner pid=896026)[0m                          'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                          'tool': None,
+[36m(TaskRunner pid=896026)[0m                          'tool_config': {'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                  'analysis': True,
+[36m(TaskRunner pid=896026)[0m                                                  'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                  'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                  'level': 'level0'},
+[36m(TaskRunner pid=896026)[0m                                          'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                   'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                          'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                    'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                    'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                          'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                           'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                           'trace_alloc_max_entries': 100000}}},
+[36m(TaskRunner pid=896026)[0m             'rollout_n': 
+[36m(TaskRunner pid=896026)[0m 3,
+[36m(TaskRunner pid=896026)[0m             'shuffle': 
+[36m(TaskRunner pid=896026)[0m False,
+[36m(TaskRunner pid=896026)[0m             'strategy': 
+[36m(TaskRunner pid=896026)[0m 'fsdp',
+[36m(TaskRunner pid=896026)[0m             'ulysses_sequence_parallel_size': 
+[36m(TaskRunner pid=896026)[0m 1,
+[36m(TaskRunner pid=896026)[0m             'use_dynamic_bsz': False},
+[36m(TaskRunner pid=896026)[0m  'custom_reward_function': 
+[36m(TaskRunner pid=896026)[0m {'name': 'compute_score',
+[36m(TaskRunner pid=896026)[0m                             'path': 
+[36m(TaskRunner pid=896026)[0m '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py'
+[36m(TaskRunner pid=896026)[0m },
+[36m(TaskRunner pid=896026)[0m  'data': 
+[36m(TaskRunner pid=896026)[0m {'apply_chat_template_kwargs': {},
+[36m(TaskRunner pid=896026)[0m           'custom_cls': {'name': None, 'path': None},
+[36m(TaskRunner pid=896026)[0m           'datagen': {'name': None, 'path': None},
+[36m(TaskRunner pid=896026)[0m           'dataloader_num_workers': 8,
+[36m(TaskRunner pid=896026)[0m           'filter_overlong_prompts': True,
+[36m(TaskRunner pid=896026)[0m           'filter_overlong_prompts_workers': 1,
+[36m(TaskRunner pid=896026)[0m           'image_key': 'images',
+[36m(TaskRunner pid=896026)[0m           'image_patch_size': 14,
+[36m(TaskRunner pid=896026)[0m           'max_prompt_length': 1024,
+[36m(TaskRunner pid=896026)[0m           'max_response_length': 2048,
+[36m(TaskRunner pid=896026)[0m           'prompt_key': 'prompt',
+[36m(TaskRunner pid=896026)[0m           'return_full_prompt': False,
+[36m(TaskRunner pid=896026)[0m           'return_multi_modal_inputs': True,
+[36m(TaskRunner pid=896026)[0m           'return_raw_chat': True,
+[36m(TaskRunner pid=896026)[0m           'return_raw_input_ids': False,
+[36m(TaskRunner pid=896026)[0m           'reward_fn_key': 'data_source',
+[36m(TaskRunner pid=896026)[0m           'sampler': {'class_name': None, 'class_path': None},
+[36m(TaskRunner pid=896026)[0m           'seed': None,
+[36m(TaskRunner pid=896026)[0m           'shuffle': True,
+[36m(TaskRunner pid=896026)[0m           'tokenizer': None,
+[36m(TaskRunner pid=896026)[0m           'tool_config_path': None,
+[36m(TaskRunner pid=896026)[0m           'train_batch_size': 8,
+[36m(TaskRunner pid=896026)[0m           'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet',
+[36m(TaskRunner pid=896026)[0m           'train_max_samples': -1,
+[36m(TaskRunner pid=896026)[0m           'truncation': 'error',
+[36m(TaskRunner pid=896026)[0m           'trust_remote_code': False,
+[36m(TaskRunner pid=896026)[0m           'use_shm': False,
+[36m(TaskRunner pid=896026)[0m           'val_batch_size': None,
+[36m(TaskRunner pid=896026)[0m           'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet',
+[36m(TaskRunner pid=896026)[0m           'val_max_samples': -1,
+[36m(TaskRunner pid=896026)[0m           'validation_shuffle': False,
+[36m(TaskRunner pid=896026)[0m           'video_key': 'videos'},
+[36m(TaskRunner pid=896026)[0m  'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                      'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                      'controller_nsight_options': {'cuda-graph-trace': 'graph',
+[36m(TaskRunner pid=896026)[0m                                                                                    'cuda-memory-usage': 'true',
+[36m(TaskRunner pid=896026)[0m                                                                                    'trace': 'cuda,nvtx,cublas,ucx'},
+[36m(TaskRunner pid=896026)[0m                                                      'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                      'worker_nsight_options': {'capture-range': 'cudaProfilerApi',
+[36m(TaskRunner pid=896026)[0m                                                                                'capture-range-end': None,
+[36m(TaskRunner pid=896026)[0m                                                                                'cuda-graph-trace': 'graph',
+[36m(TaskRunner pid=896026)[0m                                                                                'cuda-memory-usage': 'true',
+[36m(TaskRunner pid=896026)[0m                                                                                'kill': 'none',
+[36m(TaskRunner pid=896026)[0m                                                                                'trace': 'cuda,nvtx,cublas,ucx'}},
+[36m(TaskRunner pid=896026)[0m                                             'torch_memory': {'context': 'all',
+[36m(TaskRunner pid=896026)[0m                                                              'kw_args': {},
+[36m(TaskRunner pid=896026)[0m                                                              'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                              'stacks': 'all',
+[36m(TaskRunner pid=896026)[0m                                                              'trace_alloc_max_entries': 100000}},
+[36m(TaskRunner pid=896026)[0m                      'profile_continuous_steps': False,
+[36m(TaskRunner pid=896026)[0m                      'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                      'steps': None,
+[36m(TaskRunner pid=896026)[0m                      'tool': None},
+[36m(TaskRunner pid=896026)[0m  'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None},
+[36m(TaskRunner pid=896026)[0m  'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig',
+[36m(TaskRunner pid=896026)[0m                     'module': {'_target_': 'verl.trainer.config.config.ModuleConfig',
+[36m(TaskRunner pid=896026)[0m                                'name': 'custom_reward_manager',
+[36m(TaskRunner pid=896026)[0m                                'path': None},
+[36m(TaskRunner pid=896026)[0m                     'name': 'naive',
+[36m(TaskRunner pid=896026)[0m                     'source': 'register'},
+[36m(TaskRunner pid=896026)[0m  'reward_model': {'enable': False,
+[36m(TaskRunner pid=896026)[0m                   'enable_resource_pool': False,
+[36m(TaskRunner pid=896026)[0m                   'forward_max_token_len_per_gpu': 32768,
+[36m(TaskRunner pid=896026)[0m                   'launch_reward_fn_async': False,
+[36m(TaskRunner pid=896026)[0m                   'max_length': None,
+[36m(TaskRunner pid=896026)[0m                   'micro_batch_size': None,
+[36m(TaskRunner pid=896026)[0m                   'micro_batch_size_per_gpu': None,
+[36m(TaskRunner pid=896026)[0m                   'model': {'external_lib': None,
+[36m(TaskRunner pid=896026)[0m                             'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig',
+[36m(TaskRunner pid=896026)[0m                                             'forward_prefetch': False,
+[36m(TaskRunner pid=896026)[0m                                             'fsdp_size': -1,
+[36m(TaskRunner pid=896026)[0m                                             'param_offload': False,
+[36m(TaskRunner pid=896026)[0m                                             'reshard_after_forward': True,
+[36m(TaskRunner pid=896026)[0m                                             'wrap_policy': {'min_num_params': 0}},
+[36m(TaskRunner pid=896026)[0m                             'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507',
+[36m(TaskRunner pid=896026)[0m                             'override_config': {},
+[36m(TaskRunner pid=896026)[0m                             'path': '~/models/FsfairX-LLaMA3-RM-v0.1',
+[36m(TaskRunner pid=896026)[0m                             'trust_remote_code': False,
+[36m(TaskRunner pid=896026)[0m                             'use_fused_kernels': False,
+[36m(TaskRunner pid=896026)[0m                             'use_remove_padding': False,
+[36m(TaskRunner pid=896026)[0m                             'use_shm': False},
+[36m(TaskRunner pid=896026)[0m                   'n_gpus_per_node': 8,
+[36m(TaskRunner pid=896026)[0m                   'nnodes': 0,
+[36m(TaskRunner pid=896026)[0m                   'num_workers': 1,
+[36m(TaskRunner pid=896026)[0m                   'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig',
+[36m(TaskRunner pid=896026)[0m                                'all_ranks': False,
+[36m(TaskRunner pid=896026)[0m                                'enable': False,
+[36m(TaskRunner pid=896026)[0m                                'ranks': [],
+[36m(TaskRunner pid=896026)[0m                                'save_path': 'outputs/profile',
+[36m(TaskRunner pid=896026)[0m                                'tool': None,
+[36m(TaskRunner pid=896026)[0m                                'tool_config': {'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                        'analysis': True,
+[36m(TaskRunner pid=896026)[0m                                                        'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                        'discrete': False,
+[36m(TaskRunner pid=896026)[0m                                                        'level': 'level0'},
+[36m(TaskRunner pid=896026)[0m                                                'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                         'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                          'contents': [],
+[36m(TaskRunner pid=896026)[0m                                                          'discrete': False},
+[36m(TaskRunner pid=896026)[0m                                                'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig',
+[36m(TaskRunner pid=896026)[0m                                                                 'stack_depth': 32,
+[36m(TaskRunner pid=896026)[0m                                                                 'trace_alloc_max_entries': 100000}}},
+[36m(TaskRunner pid=896026)[0m                   'reward_loop_class_name': None,
+[36m(TaskRunner pid=896026)[0m                   'reward_loop_module_path': None,
+[36m(TaskRunner pid=896026)[0m                   'reward_loop_source': 'register',
+[36m(TaskRunner pid=896026)[0m                   'reward_manager': 'naive',
+[36m(TaskRunner pid=896026)[0m                   'rollout': {'_target_': 'verl.workers.config.RolloutConfig',
+[36m(TaskRunner pid=896026)[0m                               'cudagraph_capture_sizes': None,
+[36m(TaskRunner pid=896026)[0m                               'data_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                               'disable_log_stats': True,
+[36m(TaskRunner pid=896026)[0m                               'dtype': 'bfloat16',
+[36m(TaskRunner pid=896026)[0m                               'enable_chunked_prefill': True,
+[36m(TaskRunner pid=896026)[0m                               'enable_prefix_caching': True,
+[36m(TaskRunner pid=896026)[0m                               'enforce_eager': True,
+[36m(TaskRunner pid=896026)[0m                               'engine_kwargs': {},
+[36m(TaskRunner pid=896026)[0m                               'expert_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                               'free_cache_engine': True,
+[36m(TaskRunner pid=896026)[0m                               'gpu_memory_utilization': 0.5,
+[36m(TaskRunner pid=896026)[0m                               'limit_images': None,
+[36m(TaskRunner pid=896026)[0m                               'load_format': 'auto',
+[36m(TaskRunner pid=896026)[0m                               'max_model_len': None,
+[36m(TaskRunner pid=896026)[0m                               'max_num_batched_tokens': 8192,
+[36m(TaskRunner pid=896026)[0m                               'max_num_seqs': 1024,
+[36m(TaskRunner pid=896026)[0m                               'name': '???',
+[36m(TaskRunner pid=896026)[0m                               'prompt_length': 2048,
+[36m(TaskRunner pid=896026)[0m                               'response_length': 2048,
+[36m(TaskRunner pid=896026)[0m                               'skip_tokenizer_init': False,
+[36m(TaskRunner pid=896026)[0m                               'tensor_model_parallel_size': 2},
+[36m(TaskRunner pid=896026)[0m                   'sandbox_fusion': {'max_concurrent': 64,
+[36m(TaskRunner pid=896026)[0m                                      'memory_limit_mb': 1024,
+[36m(TaskRunner pid=896026)[0m                                      'url': None},
+[36m(TaskRunner pid=896026)[0m                   'strategy': 'fsdp',
+[36m(TaskRunner pid=896026)[0m                   'ulysses_sequence_parallel_size': 1,
+[36m(TaskRunner pid=896026)[0m                   'use_dynamic_bsz': False,
+[36m(TaskRunner pid=896026)[0m                   'use_reward_loop': True},
+[36m(TaskRunner pid=896026)[0m  'trainer': {'balance_batch': True,
+[36m(TaskRunner pid=896026)[0m              'critic_warmup': 0,
+[36m(TaskRunner pid=896026)[0m              'default_hdfs_dir': None,
+[36m(TaskRunner pid=896026)[0m              'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2',
+[36m(TaskRunner pid=896026)[0m              'del_local_ckpt_after_load': False,
+[36m(TaskRunner pid=896026)[0m              'device': 'cuda',
+[36m(TaskRunner pid=896026)[0m              'esi_redundant_time': 0,
+[36m(TaskRunner pid=896026)[0m              'experiment_name': '',
+[36m(TaskRunner pid=896026)[0m              'log_val_generations': 0,
+[36m(TaskRunner pid=896026)[0m              'logger': ['console', 'wandb'],
+[36m(TaskRunner pid=896026)[0m              'max_actor_ckpt_to_keep': 1,
+[36m(TaskRunner pid=896026)[0m              'max_critic_ckpt_to_keep': 1,
+[36m(TaskRunner pid=896026)[0m              'n_gpus_per_node': 2,
+[36m(TaskRunner pid=896026)[0m              'nnodes': 1,
+[36m(TaskRunner pid=896026)[0m              'project_name': '',
+[36m(TaskRunner pid=896026)[0m              'ray_wait_register_center_timeout': 300,
+[36m(TaskRunner pid=896026)[0m              'remove_previous_ckpt_in_save': True,
+[36m(TaskRunner pid=896026)[0m              'resume_from_path': None,
+[36m(TaskRunner pid=896026)[0m              'resume_mode': 'auto',
+[36m(TaskRunner pid=896026)[0m              'rollout_data_dir': None,
+[36m(TaskRunner pid=896026)[0m              'save_freq': 100,
+[36m(TaskRunner pid=896026)[0m              'test_freq': 1,
+[36m(TaskRunner pid=896026)[0m              'total_epochs': 15,
+[36m(TaskRunner pid=896026)[0m              'total_training_steps': None,
+[36m(TaskRunner pid=896026)[0m              'use_legacy_worker_impl': 'auto',
+[36m(TaskRunner pid=896026)[0m              'val_before_train': True,
+[36m(TaskRunner pid=896026)[0m              'val_only': False,
+[36m(TaskRunner pid=896026)[0m              'validation_data_dir': None},
+[36m(TaskRunner pid=896026)[0m  'transfer_queue': {'enable': False}}
+[36m(TaskRunner pid=896026)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/main_ppo.py:300: UserWarning: Disabled critic as algorithm.adv_estimator != gae. If it is not intended, please set critic.enable=True
+[36m(TaskRunner pid=896026)[0m   use_critic=need_critic(config),
+[36m(TaskRunner pid=896026)[0m [validate_config] All configuration checks passed successfully!
+[36m(TaskRunner pid=896026)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(TaskRunner pid=896026)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(TaskRunner pid=896026)[0m Using dataset class: RLHFDataset
+[36m(TaskRunner pid=896026)[0m dataset len: 3226
+[36m(TaskRunner pid=896026)[0m Setting TOKENIZERS_PARALLELISM=false for forked processes.
+[36m(TaskRunner pid=896026)[0m WARNING:2026-02-07 12:57:16,729:Setting TOKENIZERS_PARALLELISM=false for forked processes.
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1):   0%|          | 0/3226 [00:00<?, ? examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1):  31%|███       | 1000/3226 [00:03<00:07, 309.79 examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1):  62%|██████▏   | 2000/3226 [00:05<00:03, 381.72 examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1):  93%|█████████▎| 3000/3226 [00:07<00:00, 418.41 examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1): 100%|██████████| 3226/3226 [00:08<00:00, 425.98 examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1): 100%|██████████| 3226/3226 [00:08<00:00, 393.10 examples/s]
+[36m(TaskRunner pid=896026)[0m filter dataset len: 3226
+[36m(TaskRunner pid=896026)[0m Using dataset class: RLHFDataset
+[36m(TaskRunner pid=896026)[0m dataset len: 170
+[36m(TaskRunner pid=896026)[0m Setting TOKENIZERS_PARALLELISM=false for forked processes.
+[36m(TaskRunner pid=896026)[0m WARNING:2026-02-07 12:57:25,438:Setting TOKENIZERS_PARALLELISM=false for forked processes.
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1):   0%|          | 0/170 [00:00<?, ? examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1): 100%|██████████| 170/170 [00:01<00:00, 104.19 examples/s]
+[36m(TaskRunner pid=896026)[0m Filtering prompts longer than 1024 tokens (num_proc=1): 100%|██████████| 170/170 [00:01<00:00, 92.90 examples/s] 
+[36m(TaskRunner pid=896026)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/ppo/ray_trainer.py:291: UserWarning: Disabled critic as algorithm.adv_estimator != gae. If it is not intended, please set critic.enable=True
+[36m(TaskRunner pid=896026)[0m   self.use_critic = need_critic(self.config)
+[36m(TaskRunner pid=896026)[0m filter dataset len: 170
+[36m(TaskRunner pid=896026)[0m Size of train dataloader: 403, Size of val dataloader: 1
+[36m(TaskRunner pid=896026)[0m Total training steps: 6045
+[36m(TaskRunner pid=896026)[0m colocated worker base class <class 'verl.single_controller.base.worker.Worker'>
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=897656)[0m   import pynvml  # type: ignore[import]
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/backends.py:21: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(pid=897656)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+[36m(pid=897656)[0m   warnings.warn(
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+[36m(pid=897656)[0m   warnings.warn(
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+[36m(pid=897656)[0m   warnings.warn(
+[36m(pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=897657)[0m   import pynvml  # type: ignore[import]
+[36m(pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(pid=897656)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
+[36m(pid=897657)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")[32m [repeated 2x across cluster][0m
+[36m(pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+[36m(pid=897657)[0m   warnings.warn([32m [repeated 3x across cluster][0m
+[36m(pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+[36m(pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+[36m(WorkerDict pid=897656)[0m [Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
+[36m(WorkerDict pid=897656)[0m reference model: Qwen/Qwen3-4B-Instruct-2507
+[36m(WorkerDict pid=897657)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897657)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(WorkerDict pid=897656)[0m Model config after override: Qwen3Config {
+[36m(WorkerDict pid=897656)[0m   "architectures": [
+[36m(WorkerDict pid=897656)[0m     "Qwen3ForCausalLM"
+[36m(WorkerDict pid=897656)[0m   ],
+[36m(WorkerDict pid=897656)[0m   "attention_bias": false,
+[36m(WorkerDict pid=897656)[0m   "attention_dropout": 0.0,
+[36m(WorkerDict pid=897656)[0m   "dtype": "bfloat16",
+[36m(WorkerDict pid=897656)[0m   "eos_token_id": 151645,
+[36m(WorkerDict pid=897656)[0m   "head_dim": 128,
+[36m(WorkerDict pid=897656)[0m   "hidden_act": "silu",
+[36m(WorkerDict pid=897656)[0m   "hidden_size": 2560,
+[36m(WorkerDict pid=897656)[0m   "initializer_range": 0.02,
+[36m(WorkerDict pid=897656)[0m   "intermediate_size": 9728,
+[36m(WorkerDict pid=897656)[0m   "layer_types": [
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention"
+[36m(WorkerDict pid=897656)[0m   ],
+[36m(WorkerDict pid=897656)[0m   "max_position_embeddings": 262144,
+[36m(WorkerDict pid=897656)[0m   "max_window_layers": 36,
+[36m(WorkerDict pid=897656)[0m   "model_type": "qwen3",
+[36m(WorkerDict pid=897656)[0m   "num_attention_heads": 32,
+[36m(WorkerDict pid=897656)[0m   "num_hidden_layers": 36,
+[36m(WorkerDict pid=897656)[0m   "num_key_value_heads": 8,
+[36m(WorkerDict pid=897656)[0m   "pad_token_id": 151643,
+[36m(WorkerDict pid=897656)[0m   "rms_norm_eps": 1e-06,
+[36m(WorkerDict pid=897656)[0m   "rope_scaling": null,
+[36m(WorkerDict pid=897656)[0m   "rope_theta": 5000000,
+[36m(WorkerDict pid=897656)[0m   "sliding_window": null,
+[36m(WorkerDict pid=897656)[0m   "tie_word_embeddings": true,
+[36m(WorkerDict pid=897656)[0m   "transformers_version": "4.56.1",
+[36m(WorkerDict pid=897656)[0m   "use_cache": true,
+[36m(WorkerDict pid=897656)[0m   "use_sliding_window": false,
+[36m(WorkerDict pid=897656)[0m   "vocab_size": 151936
+[36m(WorkerDict pid=897656)[0m }
+[36m(WorkerDict pid=897656)[0m 
+[36m(WorkerDict pid=897657)[0m `torch_dtype` is deprecated! Use `dtype` instead!
+[36m(WorkerDict pid=897657)[0m Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen3ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`
+[36m(WorkerDict pid=897657)[0m Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen3Model is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
+[36m(WorkerDict pid=897656)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897656)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:  33%|███▎      | 1/3 [00:06<00:12,  6.30s/it]
+[36m(WorkerDict pid=897656)[0m `torch_dtype` is deprecated! Use `dtype` instead!
+[36m(WorkerDict pid=897656)[0m Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen3Model is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`[32m [repeated 2x across cluster][0m
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:  67%|██████▋   | 2/3 [00:12<00:06,  6.32s/it][32m [repeated 2x across cluster][0m
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  3.51s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.26s/it]
+[36m(WorkerDict pid=897657)[0m Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
+[36m(WorkerDict pid=897657)[0m Skipping monkey patch for Qwen3ForCausalLM as use_fused_kernels is False or fused_kernels_backend is torch
+[36m(WorkerDict pid=897657)[0m [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
+[36m(WorkerDict pid=897656)[0m Qwen3ForCausalLM contains 4.02B parameters
+[36m(WorkerDict pid=897656)[0m wrap_policy: functools.partial(<function _or_policy at 0x7f9230fa54e0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f9230fa53a0>, transformer_layer_cls={<class 'transformers.models.qwen3.modeling_qwen3.Qwen3DecoderLayer'>})])
+[36m(WorkerDict pid=897656)[0m NCCL version 2.27.3+cuda12.9
+[36m(WorkerDict pid=897656)[0m 
+[36m(WorkerDict pid=897656)[0m [2026-02-07 12:58:25] gamma:897656:899170 [0] ras/client_support.cc:160 NCCL WARN Call to bind failed: Address already in use
+[36m(WorkerDict pid=897656)[0m Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
+[36m(WorkerDict pid=897656)[0m Skipping monkey patch for Qwen3ForCausalLM as use_fused_kernels is False or fused_kernels_backend is torch
+[36m(WorkerDict pid=897657)[0m 
+[36m(WorkerDict pid=897656)[0m Ref use_remove_padding=True
+[36m(WorkerDict pid=897656)[0m Ref use_fused_kernels=False
+[36m(WorkerDict pid=897656)[0m Ref use_prefix_grouper=False
+[36m(WorkerDict pid=897657)[0m [2026-02-07 12:58:25] gamma:897657:899174 [0] ras/client_support.cc:160 NCCL WARN Call to bind failed: Address already in use
+[36m(WorkerDict pid=897656)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897656)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards:  67%|██████▋   | 2/3 [00:13<00:06,  6.60s/it]
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  3.64s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  4.49s/it]
+[36m(WorkerDict pid=897656)[0m Model config after override: Qwen3Config {
+[36m(WorkerDict pid=897656)[0m   "architectures": [
+[36m(WorkerDict pid=897656)[0m     "Qwen3ForCausalLM"
+[36m(WorkerDict pid=897656)[0m   ],
+[36m(WorkerDict pid=897656)[0m   "attention_bias": false,
+[36m(WorkerDict pid=897656)[0m   "attention_dropout": 0.0,
+[36m(WorkerDict pid=897656)[0m   "dtype": "bfloat16",
+[36m(WorkerDict pid=897656)[0m   "eos_token_id": 151645,
+[36m(WorkerDict pid=897656)[0m   "head_dim": 128,
+[36m(WorkerDict pid=897656)[0m   "hidden_act": "silu",
+[36m(WorkerDict pid=897656)[0m   "hidden_size": 2560,
+[36m(WorkerDict pid=897656)[0m   "initializer_range": 0.02,
+[36m(WorkerDict pid=897656)[0m   "intermediate_size": 9728,
+[36m(WorkerDict pid=897656)[0m   "layer_types": [
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention",
+[36m(WorkerDict pid=897656)[0m     "full_attention"
+[36m(WorkerDict pid=897656)[0m   ],
+[36m(WorkerDict pid=897656)[0m   "max_position_embeddings": 262144,
+[36m(WorkerDict pid=897656)[0m   "max_window_layers": 36,
+[36m(WorkerDict pid=897656)[0m   "model_type": "qwen3",
+[36m(WorkerDict pid=897656)[0m   "num_attention_heads": 32,
+[36m(WorkerDict pid=897656)[0m   "num_hidden_layers": 36,
+[36m(WorkerDict pid=897656)[0m   "num_key_value_heads": 8,
+[36m(WorkerDict pid=897656)[0m   "pad_token_id": 151643,
+[36m(WorkerDict pid=897656)[0m   "rms_norm_eps": 1e-06,
+[36m(WorkerDict pid=897656)[0m   "rope_scaling": null,
+[36m(WorkerDict pid=897656)[0m   "rope_theta": 5000000,
+[36m(WorkerDict pid=897656)[0m   "sliding_window": null,
+[36m(WorkerDict pid=897656)[0m   "tie_word_embeddings": true,
+[36m(WorkerDict pid=897656)[0m   "transformers_version": "4.56.1",
+[36m(WorkerDict pid=897656)[0m   "use_cache": true,
+[36m(WorkerDict pid=897656)[0m   "use_sliding_window": false,
+[36m(WorkerDict pid=897656)[0m   "vocab_size": 151936
+[36m(WorkerDict pid=897656)[0m }
+[36m(WorkerDict pid=897656)[0m 
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:  33%|███▎      | 1/3 [00:06<00:12,  6.25s/it]
+[36m(WorkerDict pid=897657)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897657)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards:  67%|██████▋   | 2/3 [00:12<00:06,  6.45s/it][32m [repeated 2x across cluster][0m
+[36m(WorkerDict pid=897657)[0m Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  3.58s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  4.34s/it]
+[36m(WorkerDict pid=897657)[0m Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
+[36m(WorkerDict pid=897657)[0m Skipping monkey patch for Qwen3ForCausalLM as use_fused_kernels is False or fused_kernels_backend is torch
+[36m(WorkerDict pid=897656)[0m Qwen3ForCausalLM contains 4.02B parameters
+[36m(WorkerDict pid=897656)[0m wrap_policy: functools.partial(<function _or_policy at 0x7f9230fa54e0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f9230fa53a0>, transformer_layer_cls={<class 'transformers.models.qwen3.modeling_qwen3.Qwen3DecoderLayer'>})])
+[36m(WorkerDict pid=897656)[0m Total steps: 6045, num_warmup_steps: 0
+[36m(WorkerDict pid=897656)[0m Actor use_remove_padding=True
+[36m(WorkerDict pid=897656)[0m Actor use_fused_kernels=False
+[36m(WorkerDict pid=897656)[0m Actor use_prefix_grouper=False
+[36m(WorkerDict pid=897656)[0m Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
+[36m(WorkerDict pid=897656)[0m Skipping monkey patch for Qwen3ForCausalLM as use_fused_kernels is False or fused_kernels_backend is torch
+[36m(WorkerDict pid=897657)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897657)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards:  67%|██████▋   | 2/3 [00:13<00:06,  6.91s/it]
+[36m(WorkerDict pid=897656)[0m Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  3.81s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  4.65s/it]
+[36m(WorkerDict pid=897656)[0m [Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
+[36m(WorkerDict pid=897656)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(WorkerDict pid=897656)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(WorkerDict pid=897656)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:678: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
+[36m(WorkerDict pid=897656)[0m   warnings.warn(
+[36m(WorkerDict pid=897656)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(WorkerDict pid=897656)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(TaskRunner pid=896026)[0m WARNING 02-07 12:59:32 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(WorkerDict pid=897657)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0[32m [repeated 3x across cluster][0m
+[36m(pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=901562)[0m   import pynvml  # type: ignore[import]
+[36m(WorkerDict pid=897657)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:678: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
+[36m(WorkerDict pid=897657)[0m   warnings.warn(
+[36m(pid=901562)[0m WARNING 02-07 12:59:59 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(pid=901571)[0m WARNING 02-07 12:59:59 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/backends.py:21: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=901571)[0m   import pynvml  # type: ignore[import]
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn(
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn(
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn(
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")
+[36m(vLLMHttpServer pid=901562)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(vLLMHttpServer pid=901562)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(vLLMHttpServer pid=901562)[0m WARNING:2026-02-07 13:00:03,461:agent loop only support torch and npu profiler, got None
+[36m(vLLMHttpServer pid=901562)[0m INFO:2026-02-07 13:00:03,463:vLLMHttpServer, replica_rank: 1, node_rank: 0, CUDA_VISIBLE_DEVICES: 3, master_address: 172.16.34.29, master_port: 32975, data_parallel_rpc_port: 43977, data_parallel_master_port: 36417
+[36m(vLLMHttpServer pid=901562)[0m INFO:2026-02-07 13:00:03,480:override_generation_config: {'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'repetition_penalty': 1.0, 'max_new_tokens': 2048}
+[36m(vLLMHttpServer pid=901562)[0m INFO:2026-02-07 13:00:03,480:enable_sleep_mode: True
+[36m(vLLMHttpServer pid=901562)[0m `torch_dtype` is deprecated! Use `dtype` instead!
+[36m(vLLMHttpServer pid=901562)[0m WARNING 02-07 13:00:05 [__init__.py:3036] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: In a Ray actor and can only be spawned; CUDA is initialized
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(vLLMHttpServer pid=901562)[0m   import pynvml  # type: ignore[import]
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_layer_specs.py:67: UserWarning: Apex is not installed. Falling back to Torch Norm[32m [repeated 2x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m   warnings.warn("Apex is not installed. Falling back to Torch Norm")[32m [repeated 2x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.
+[36m(vLLMHttpServer pid=901571)[0m   warnings.warn([32m [repeated 3x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/optimizer.py:28: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier and multi_tensor_scale
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/megatron/core/optimizer/clip_grads.py:29: UserWarning: Transformer Engine and Apex are not installed. Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale
+[36m(vLLMHttpServer pid=901562)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(vLLMHttpServer pid=901562)[0m   import pynvml  # type: ignore[import]
+[36m(vLLMHttpServer pid=901571)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(vLLMHttpServer pid=901571)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(vLLMHttpServer pid=901571)[0m WARNING:2026-02-07 13:00:30,122:agent loop only support torch and npu profiler, got None
+[36m(vLLMHttpServer pid=901571)[0m INFO:2026-02-07 13:00:30,124:vLLMHttpServer, replica_rank: 0, node_rank: 0, CUDA_VISIBLE_DEVICES: 2, master_address: 172.16.34.29, master_port: 40643, data_parallel_rpc_port: 37497, data_parallel_master_port: 45967
+[36m(vLLMHttpServer pid=901571)[0m INFO:2026-02-07 13:00:30,143:override_generation_config: {'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'repetition_penalty': 1.0, 'max_new_tokens': 2048}
+[36m(vLLMHttpServer pid=901571)[0m INFO:2026-02-07 13:00:30,143:enable_sleep_mode: True
+[36m(vLLMHttpServer pid=901571)[0m ['serve',
+[36m(vLLMHttpServer pid=901571)[0m  'Qwen/Qwen3-4B-Instruct-2507',
+[36m(vLLMHttpServer pid=901571)[0m  '--dtype',
+[36m(vLLMHttpServer pid=901571)[0m  'bfloat16',
+[36m(vLLMHttpServer pid=901571)[0m  '--load_format',
+[36m(vLLMHttpServer pid=901571)[0m  'dummy',
+[36m(vLLMHttpServer pid=901571)[0m  '--distributed_executor_backend',
+[36m(vLLMHttpServer pid=901571)[0m  'mp',
+[36m(vLLMHttpServer pid=901571)[0m  '--worker_extension_cls',
+[36m(vLLMHttpServer pid=901571)[0m  'verl.workers.rollout.vllm_rollout.utils.vLLMColocateWorkerExtension',
+[36m(vLLMHttpServer pid=901571)[0m  '--max_model_len',
+[36m(vLLMHttpServer pid=901571)[0m  '8192',
+[36m(vLLMHttpServer pid=901571)[0m  '--max_num_seqs',
+[36m(vLLMHttpServer pid=901571)[0m  '1024',
+[36m(vLLMHttpServer pid=901571)[0m  '--enable_chunked_prefill',
+[36m(vLLMHttpServer pid=901571)[0m  '--max_num_batched_tokens',
+[36m(vLLMHttpServer pid=901571)[0m  '8192',
+[36m(vLLMHttpServer pid=901571)[0m  '--enable_prefix_caching',
+[36m(vLLMHttpServer pid=901571)[0m  '--enable_sleep_mode',
+[36m(vLLMHttpServer pid=901571)[0m  '--logprobs_mode',
+[36m(vLLMHttpServer pid=901571)[0m  'processed_logprobs',
+[36m(vLLMHttpServer pid=901571)[0m  '--gpu_memory_utilization',
+[36m(vLLMHttpServer pid=901571)[0m  '0.6',
+[36m(vLLMHttpServer pid=901571)[0m  '--disable_log_stats',
+[36m(vLLMHttpServer pid=901571)[0m  '--tensor_parallel_size',
+[36m(vLLMHttpServer pid=901571)[0m  '1',
+[36m(vLLMHttpServer pid=901571)[0m  '--seed',
+[36m(vLLMHttpServer pid=901571)[0m  '0',
+[36m(vLLMHttpServer pid=901571)[0m  '--override_generation_config',
+[36m(vLLMHttpServer pid=901571)[0m  '{"temperature": 1.0, "top_k": -1, "top_p": 1, "repetition_penalty": 1.0, '
+[36m(vLLMHttpServer pid=901571)[0m  '"max_new_tokens": 2048}',
+[36m(vLLMHttpServer pid=901571)[0m  '--hf_overrides',
+[36m(vLLMHttpServer pid=901571)[0m  '{}',
+[36m(vLLMHttpServer pid=901571)[0m  '--scheduling_policy',
+[36m(vLLMHttpServer pid=901571)[0m  'fcfs',
+[36m(vLLMHttpServer pid=901571)[0m  '--compilation_config',
+[36m(vLLMHttpServer pid=901571)[0m  '{"cudagraph_mode": "FULL_AND_PIECEWISE"}']
+[36m(vLLMHttpServer pid=901571)[0m `torch_dtype` is deprecated! Use `dtype` instead!
+[36m(vLLMHttpServer pid=901571)[0m WARNING 02-07 13:00:31 [__init__.py:3036] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: In a Ray actor and can only be spawned; CUDA is initialized
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(vLLMHttpServer pid=901571)[0m   import pynvml  # type: ignore[import]
+[36m(vLLMHttpServer pid=901562)[0m W0207 13:00:51.123000 903087 /data/home_beta/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
+[36m(vLLMHttpServer pid=901562)[0m W0207 13:00:51.123000 903087 /data/home_beta/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[36m(vLLMHttpServer pid=901571)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(vLLMHttpServer pid=901571)[0m   import pynvml  # type: ignore[import]
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901562)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[36m(vLLMHttpServer pid=901571)[0m W0207 13:01:17.192000 903817 /data/home_beta/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
+[36m(vLLMHttpServer pid=901571)[0m W0207 13:01:17.192000 903817 /data/home_beta/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[36m(vLLMHttpServer pid=901571)[0m [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0[32m [repeated 6x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m [1;36m(Worker pid=903087)[0;0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/67 [00:00<?, ?it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   1%|▏         | 1/67 [00:00<00:54,  1.20it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   3%|▎         | 2/67 [00:00<00:27,  2.39it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   4%|▍         | 3/67 [00:02<00:47,  1.35it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   6%|▌         | 4/67 [00:02<00:31,  1.99it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   9%|▉         | 6/67 [00:03<00:33,  1.80it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  10%|█         | 7/67 [00:03<00:25,  2.31it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  13%|█▎        | 9/67 [00:03<00:16,  3.54it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  15%|█▍        | 10/67 [00:03<00:13,  4.12it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  16%|█▋        | 11/67 [00:04<00:26,  2.10it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  19%|█▉        | 13/67 [00:06<00:26,  2.02it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  21%|██        | 14/67 [00:06<00:23,  2.25it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  22%|██▏       | 15/67 [00:06<00:19,  2.71it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  24%|██▍       | 16/67 [00:07<00:24,  2.12it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  25%|██▌       | 17/67 [00:07<00:24,  2.06it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  28%|██▊       | 19/67 [00:08<00:26,  1.84it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  30%|██▉       | 20/67 [00:09<00:20,  2.28it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  33%|███▎      | 22/67 [00:10<00:21,  2.11it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  34%|███▍      | 23/67 [00:10<00:18,  2.36it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  37%|███▋      | 25/67 [00:11<00:21,  1.98it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  39%|███▉      | 26/67 [00:11<00:17,  2.35it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  42%|████▏     | 28/67 [00:12<00:18,  2.12it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  43%|████▎     | 29/67 [00:13<00:15,  2.43it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  46%|████▋     | 31/67 [00:13<00:10,  3.39it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  48%|████▊     | 32/67 [00:13<00:08,  3.92it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  49%|████▉     | 33/67 [00:14<00:15,  2.14it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  51%|█████     | 34/67 [00:14<00:12,  2.63it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  52%|█████▏    | 35/67 [00:15<00:18,  1.69it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  54%|█████▎    | 36/67 [00:15<00:14,  2.15it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  55%|█████▌    | 37/67 [00:16<00:10,  2.75it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  58%|█████▊    | 39/67 [00:17<00:12,  2.24it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  60%|█████▉    | 40/67 [00:17<00:12,  2.16it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  61%|██████    | 41/67 [00:18<00:16,  1.58it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  63%|██████▎   | 42/67 [00:18<00:12,  2.02it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  64%|██████▍   | 43/67 [00:19<00:09,  2.53it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  66%|██████▌   | 44/67 [00:19<00:10,  2.26it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  67%|██████▋   | 45/67 [00:20<00:11,  1.89it/s]
+[36m(vLLMHttpServer pid=901571)[0m [1;36m(Worker pid=903817)[0;0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/67 [00:00<?, ?it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  69%|██████▊   | 46/67 [00:21<00:12,  1.68it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  73%|███████▎  | 49/67 [00:22<00:10,  1.67it/s][32m [repeated 4x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  91%|█████████ | 61/67 [00:28<00:03,  1.90it/s][32m [repeated 18x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  93%|█████████▎| 62/67 [00:28<00:02,  2.11it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  96%|█████████▌| 64/67 [00:28<00:00,  3.12it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  97%|█████████▋| 65/67 [00:28<00:00,  3.64it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  99%|█████████▊| 66/67 [00:29<00:00,  3.48it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:29<00:00,  2.22it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:29<00:00,  2.23it/s]
+[36m(vLLMHttpServer pid=901562)[0m [1;36m(Worker pid=903087)[0;0m Capturing CUDA graphs (decode, FULL):   0%|          | 0/67 [00:00<?, ?it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):   1%|▏         | 1/67 [00:00<00:07,  9.43it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):   3%|▎         | 2/67 [00:01<00:44,  1.45it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):   4%|▍         | 3/67 [00:01<00:27,  2.34it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):   7%|▋         | 5/67 [00:01<00:14,  4.29it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):   9%|▉         | 6/67 [00:02<00:19,  3.19it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  10%|█         | 7/67 [00:02<00:24,  2.49it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  12%|█▏        | 8/67 [00:02<00:18,  3.14it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  13%|█▎        | 9/67 [00:03<00:29,  1.94it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  39%|███▉      | 26/67 [00:11<00:20,  1.96it/s][32m [repeated 10x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  15%|█▍        | 10/67 [00:03<00:25,  2.27it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  16%|█▋        | 11/67 [00:04<00:19,  2.95it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  19%|█▉        | 13/67 [00:04<00:12,  4.45it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  21%|██        | 14/67 [00:05<00:23,  2.23it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  24%|██▍       | 16/67 [00:06<00:26,  1.96it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  27%|██▋       | 18/67 [00:06<00:17,  2.77it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  28%|██▊       | 19/67 [00:06<00:14,  3.24it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  31%|███▏      | 21/67 [00:08<00:19,  2.36it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  34%|███▍      | 23/67 [00:09<00:21,  2.05it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  60%|█████▉    | 40/67 [00:17<00:11,  2.32it/s][32m [repeated 9x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  36%|███▌      | 24/67 [00:09<00:17,  2.42it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  37%|███▋      | 25/67 [00:09<00:14,  2.88it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  39%|███▉      | 26/67 [00:09<00:11,  3.47it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  40%|████      | 27/67 [00:10<00:13,  2.87it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  42%|████▏     | 28/67 [00:10<00:17,  2.23it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  43%|████▎     | 29/67 [00:11<00:22,  1.71it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  45%|████▍     | 30/67 [00:12<00:18,  2.05it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  48%|████▊     | 32/67 [00:12<00:10,  3.25it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  51%|█████     | 34/67 [00:13<00:13,  2.46it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  52%|█████▏    | 35/67 [00:13<00:11,  2.79it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  54%|█████▎    | 36/67 [00:13<00:10,  2.90it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  55%|█████▌    | 37/67 [00:14<00:14,  2.06it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  76%|███████▌  | 51/67 [00:22<00:06,  2.41it/s][32m [repeated 10x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  57%|█████▋    | 38/67 [00:14<00:11,  2.61it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  60%|█████▉    | 40/67 [00:15<00:06,  4.01it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  61%|██████    | 41/67 [00:16<00:12,  2.15it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  64%|██████▍   | 43/67 [00:16<00:07,  3.14it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  66%|██████▌   | 44/67 [00:17<00:11,  2.07it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  67%|██████▋   | 45/67 [00:17<00:09,  2.33it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  70%|███████   | 47/67 [00:18<00:09,  2.03it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  93%|█████████▎| 62/67 [00:28<00:02,  2.05it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  72%|███████▏  | 48/67 [00:19<00:07,  2.47it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  75%|███████▍  | 50/67 [00:19<00:04,  3.53it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  96%|█████████▌| 64/67 [00:28<00:01,  2.89it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  76%|███████▌  | 51/67 [00:19<00:03,  4.02it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  97%|█████████▋| 65/67 [00:28<00:00,  3.35it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  78%|███████▊  | 52/67 [00:20<00:06,  2.38it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|████████▉ | 60/67 [00:26<00:02,  2.47it/s][32m [repeated 7x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:29<00:00,  2.44it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:29<00:00,  2.26it/s]
+[36m(vLLMHttpServer pid=901571)[0m [1;36m(Worker pid=903817)[0;0m Capturing CUDA graphs (decode, FULL):   0%|          | 0/67 [00:00<?, ?it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  79%|███████▉  | 53/67 [00:20<00:05,  2.65it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  90%|████████▉ | 60/67 [00:23<00:02,  2.59it/s][32m [repeated 9x across cluster][0m
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  94%|█████████▍| 63/67 [00:23<00:00,  4.25it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  96%|█████████▌| 64/67 [00:23<00:00,  4.00it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL):  97%|█████████▋| 65/67 [00:24<00:00,  2.54it/s]
+[36m(vLLMHttpServer pid=901562)[0m Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:24<00:00,  3.73it/s]Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:24<00:00,  2.70it/s]
+[36m(vLLMHttpServer pid=901562)[0m WARNING 02-07 13:02:53 [model.py:1389] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  31%|███▏      | 21/67 [00:08<00:22,  2.04it/s][32m [repeated 12x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  57%|█████▋    | 38/67 [00:13<00:12,  2.38it/s][32m [repeated 12x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  81%|████████  | 54/67 [00:19<00:04,  2.71it/s][32m [repeated 11x across cluster][0m
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  93%|█████████▎| 62/67 [00:21<00:01,  2.99it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  94%|█████████▍| 63/67 [00:22<00:01,  2.89it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  97%|█████████▋| 65/67 [00:22<00:00,  4.06it/s]
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  99%|█████████▊| 66/67 [00:23<00:00,  2.34it/s]Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:23<00:00,  2.84it/s]
+[36m(vLLMHttpServer pid=901571)[0m WARNING 02-07 13:03:12 [model.py:1389] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+[36m(vLLMHttpServer pid=901571)[0m INFO:2026-02-07 13:03:12,697:Initializing a V1 LLM engine with config: model='Qwen/Qwen3-4B-Instruct-2507', speculative_config=None, tokenizer='Qwen/Qwen3-4B-Instruct-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=dummy, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=Qwen/Qwen3-4B-Instruct-2507, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[36m(vLLMHttpServer pid=901571)[0m Capturing CUDA graphs (decode, FULL):  90%|████████▉ | 60/67 [00:21<00:01,  3.52it/s][32m [repeated 5x across cluster][0m
+[36m(TaskRunner pid=896026)[0m AgentLoopManager: ['172.16.34.29:36859', '172.16.34.29:33941']
+[36m(TaskRunner pid=896026)[0m wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/mshahidul/.netrc.
+[36m(TaskRunner pid=896026)[0m wandb: Currently logged in as: shahidulshakib034 (shahidulshakib034-khulna-university-of-engineering-techn) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+[36m(TaskRunner pid=896026)[0m wandb: setting up run f14af5tw
+[36m(pid=907702)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=907702)[0m   import pynvml  # type: ignore[import]
+[36m(TaskRunner pid=896026)[0m wandb: Tracking run with wandb version 0.24.1
+[36m(TaskRunner pid=896026)[0m wandb: Run data is saved locally in /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_130329-f14af5tw
+[36m(TaskRunner pid=896026)[0m wandb: Run `wandb offline` to turn off syncing.
+[36m(TaskRunner pid=896026)[0m wandb: Syncing run revived-frost-5
+[36m(TaskRunner pid=896026)[0m wandb: ⭐️ View project at https://wandb.ai/shahidulshakib034-khulna-university-of-engineering-techn/uncategorized
+[36m(TaskRunner pid=896026)[0m wandb: 🚀 View run at https://wandb.ai/shahidulshakib034-khulna-university-of-engineering-techn/uncategorized/runs/f14af5tw
+[36m(TaskRunner pid=896026)[0m wandb: Detected [openai] in use.
+[36m(TaskRunner pid=896026)[0m wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+[36m(TaskRunner pid=896026)[0m wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+[36m(pid=907698)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.[32m [repeated 7x across cluster][0m
+[36m(pid=907698)[0m   import pynvml  # type: ignore[import][32m [repeated 7x across cluster][0m
+[36m(TaskRunner pid=896026)[0m Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt
+[36m(TaskRunner pid=896026)[0m Training from scratch
+[36m(WorkerDict pid=897656)[0m WARNING 02-07 13:03:44 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(WorkerDict pid=897656)[0m INFO:2026-02-07 13:03:48,410:update_weights done, time cost: 2.33s
+[36m(TaskRunner pid=896026)[0m test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0}
+[36m(WorkerDict pid=897657)[0m WARNING 02-07 13:03:44 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(AgentLoopWorker pid=907700)[0m WARNING 02-07 13:05:46 [api_server.py:1213] LoRA dynamic loading & unloading is enabled in the API server. This should ONLY be used for local development!
+[36m(AgentLoopWorker pid=907700)[0m Using dataset class: RLHFDataset
+[36m(AgentLoopWorker pid=907702)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing
+[36m(AgentLoopWorker pid=907702)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
+[36m(AgentLoopWorker pid=907702)[0m You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
+[36m(AgentLoopWorker pid=907696)[0m /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/tokenizer.py:109: UserWarning: Failed to create processor: Unsupported processor type: Qwen2TokenizerFast. This may affect multimodal processing[32m [repeated 7x across cluster][0m
+[36m(AgentLoopWorker pid=907696)[0m   warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)[32m [repeated 7x across cluster][0m
+[36m(pid=912712)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+[36m(pid=912712)[0m   import pynvml  # type: ignore[import]
+[36m(AgentLoopWorker pid=907696)[0m You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.[32m [repeated 7x across cluster][0m
+[36m(pid=913253)[0m /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.[32m [repeated 6x across cluster][0m
+[36m(pid=913253)[0m   import pynvml  # type: ignore[import][32m [repeated 6x across cluster][0m