diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/08-03-02/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-18-25/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-27-06/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/11-29-26/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/12-06-58/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-08-13/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-14-31/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-01/21-36-33/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-08-56/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-39-41/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-41-59/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-57-12/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: 
constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + 
_target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + 
log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: 
${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + 
return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + 
_target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: 
${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + 
_target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..599a58a76524a82196375a89e9799deffb783d12 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-15-37/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-25-32/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- 
actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 
0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: 
[] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: 
verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + 
enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: 
${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + 
val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + 
external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: 
verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + 
_target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98a95643027e6e5d887350f3fccc85b2b641c1f3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-28-20/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0711f3205e7405a13d0858ade535b3876b220f59 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-41-23/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88e37e42277bb0281f7da13d66247969dbd034ff --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/00-55-56/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..97e3a307eca5af05c5880497923cb12c15480ff0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28c4b07fe714adcc30149fab377385e364c43241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/hydra.yaml @@ -0,0 +1,206 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c19a30fd29a957a0c16b4eee58674d519d5a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-04-57/.hydra/overrides.yaml @@ -0,0 +1,39 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..441b2dd2afa248d401a083ced38f027dfd127e46 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + 
data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 768 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6163aa3d72edcfbbfc0f1a43ffb904357d97cb39 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/hydra.yaml @@ -0,0 +1,207 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=512 + - data.max_response_length=768 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=512,data.max_response_length=768,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..811e1d8b3d29e0b89d31c92531bf81ed525d9a0f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/01-53-27/.hydra/overrides.yaml @@ -0,0 +1,40 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=512 +- data.max_response_length=768 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..2d937ef596c29a47778876a523029e45331da7fa --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-02/09-24-01/.hydra/config.yaml @@ -0,0 +1,648 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + 
shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: 
${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: 
verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + 
nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 
+ use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: 
verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: 
false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 
0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + 
optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: 
true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: 
${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: 
${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: 
/tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + 
filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + 
lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: 
${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true 
+global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- 
actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + 
tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: 
true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + 
load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: 
${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + 
return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + 
_target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: 
${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true 
+global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c47d6b2f2d6efce57a40efef1ce4a49a555b9fe --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-27-48/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..829134df85779cad5d4138968c1f8e9b0476bf65 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/10-53-43/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bdcd68b3620e08120db72f9637be2ff8ed0f428 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=256 + - data.max_response_length=256 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=256,data.max_response_length=256,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f06a900c99d8943ea4b7f1bfea5bcc16af966d1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=256 +- data.max_response_length=256 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-09-07/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b8eb11626ebc1e77d39ac0e2288c470fdee0d24 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-11-49/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bdc84d2a862fb28d60e4edbaceef18d23f39ee6 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/11-23-19/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec1d5a2d765e8d567d6564b8f708bd7a69d8f11 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea2ce354f91d73e35f331e0e208f0ec73547efde --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212dc359485a09e69637d0969ef12e6884751a18 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-20-12/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..053ad4256269557f4e9948f7e44454bf42e28517 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + 
min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: '' + experiment_name: '' + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93cbb6436df074d32489293796f07bc3d1f55115 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + 
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name= + - trainer.experiment_name= + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a5e920e50bb4557935d2d4e7be1f8ac80d46241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name= +- trainer.experiment_name= +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-49-57/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..053ad4256269557f4e9948f7e44454bf42e28517 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 4 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 2 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + 
clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: 
${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: 
verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 2 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 
+ default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + 
_target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 8 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + 
lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: 
verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: '' + experiment_name: '' + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e72064b9dffaf1b54c89b4d73f2eef5ad170ca7c --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + 
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=8 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=4 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name= + - trainer.experiment_name= + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=1 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2,actor_rollout_ref.actor.ppo_mini_batch_size=4,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=8,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.resume_mode=auto,trainer.save_freq=100,trainer.test_freq=1,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a5e920e50bb4557935d2d4e7be1f8ac80d46241 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=8 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=4 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name= +- trainer.experiment_name= +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=1 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/main_ppo.log 
b/code/RL_model/verl/verl_train/outputs/2026-02-07/12-56-28/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + 
clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: 
${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: 
verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 
+ default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + 
_target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + 
lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: 
verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc11d1bbff71d2857efbf1edef08a81d573a50f5 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-15-29/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-25-00/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-25-00/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17e2b6d7bdbee28362cf3ce481317dc0b5c563a1 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. 
+ + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - 
actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-31-20/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-07/13-35-01/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-07/13-35-01/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 
+ calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9c4a6e8292582158ff32826d10566cd72c3a6b9 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-25-11/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f7b60e241a232606009053fee7a9d12d876d1ae --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-27-02/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87cb56b201a32ef2cb619c3f186eecb9e96de1f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd7a40a04ba1ff9603f5f0e84fe00a55f3796126 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7acc6cbceb9337194a0e9f67e46faf28c74fc7d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-40-54/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfff1d726a653988a9400013e9550e4f378045a8 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/13-42-07/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: 
+ _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp 
+ dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + 
total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8886b384b143167ef57d3eaa008903f339532d10 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: 
${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-09/23-59-31/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + 
_target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + 
sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + 
dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: 
verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 
+ min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + 
_target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + 
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 
+ total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ce101a017b96527d4997b6a52bc615c492d1182 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: 
multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-14-39/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: 
compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9adca99977e28d2d0dcbfa6012fc1d52661038d9 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/01-19-52/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df91e2ddafa426bb1333e1c6947aca2f49cfc878 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + name: 
compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..877c20bdd3a014869b3408e4ba4ff8a77d573516 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,tra
iner.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1137a7e54696f00ce70c95dc4c89e9f69d11ded --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- 
trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/02-36-06/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34450aec9ebe996139ae6513e150856e47ef8db0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + 
freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + 
calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + 
forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: 
+ _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + 
impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 
0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: 
${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + 
max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + 
name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73400d35e07cb1c4358345307bf9337ad16cdb1b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10
,trainer.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4e9fd6c7e73a162244a962347f408997e5aa2b2 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet 
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- 
trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-33-25/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34450aec9ebe996139ae6513e150856e47ef8db0 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 
16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: 
false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + 
entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + 
tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + 
exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + 
lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + 
data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false 
+ micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: 
/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1cbe8c3e8fef63358dbcc5754163f8966f31b1d --- /dev/null +++ 
b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10
,trainer.total_epochs=15 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4e9fd6c7e73a162244a962347f408997e5aa2b2 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet 
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- 
trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-13/21-51-09/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cd1e8433dffa0b3ba420be3e346f4f5cd062014 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4a959b20f525c2d38248c56e3b3c57fc823b66 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_gpu.py @@ -0,0 +1,139 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest +import ray + +from tests.checkpoint_engine.test_utils import create_rollout_worker_group, create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager +from verl.single_controller.ray.base import ( + RayResourcePool, + split_resource_pool, +) +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig + + +@pytest.mark.asyncio +@pytest.mark.parametrize("rebuild_group", [False, True]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_nccl_checkpoint_engine( + rebuild_group, + num_trainer, + num_rollout, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + "UCX_TLS": "rc,tcp,cuda", + "UCX_MAX_RNDV_RAILS": "4", + "UCX_LOG_LEVEL": "INFO", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig( + backend="nccl", engine_kwargs={"nccl": {"rebuild_group": rebuild_group}} + ) + model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, 
checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="nccl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +@pytest.mark.skip(reason="temporary skip since our ci environment is not ready") +@pytest.mark.asyncio +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_nixl_checkpoint_engine( + num_trainer, + num_rollout, + device, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + # TODO: it's pretty hard to set these environment variables right, please consult + # with your network admin. Maybe auto adjust UCX_* according to NCCL_IB_*? 
+ "UCX_TLS": "rc,ud,cuda", + # "UCX_IB_GID_INDEX": "3", # NCCL_IB_GID_INDEX + # "UCX_IB_DEVICES": "mlx5_1:1,mlx5_2:1,mlx5_3:1", # NCCL_IB_HCA + "UCX_RC_TIMEOUT": "30s", # NCCL_IB_TIMEOUT + "UCX_RC_RETRY_COUNT": "7", # NCCL_IB_RETRY_COUNT + "UCX_KEEPALIVE_INTERVAL": "1s", + "UCX_KEEPALIVE_NUM_EPS": "10", + "UCX_MAX_RNDV_RAILS": "4", + "UCX_IB_ROCE_REACHABILITY_MODE": "all", + "UCX_LOG_LEVEL": "INFO", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig(backend="nixl", engine_kwargs={"nixl": {"device": device}}) + model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="nixl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +if __name__ == "__main__": + test_nccl_checkpoint_engine( + rebuild_group=False, + num_trainer=2, + num_rollout=30, + num_nodes=4, + num_gpus_per_node=8, + check_allclose=False, + model_path=os.environ["HDFS_ROOT"] + "/model/Qwen3-30B-A3B-Base", + ) diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py new file mode 100644 index 
0000000000000000000000000000000000000000..b99fcc771bef4dca4eb13b836b436539fbb55172 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_correctness_on_npu.py @@ -0,0 +1,86 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest +import ray + +from tests.checkpoint_engine.test_utils import create_rollout_worker_group, create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager +from verl.single_controller.ray.base import ( + RayResourcePool, + split_resource_pool, +) +from verl.utils.device import get_device_name +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig + + +@pytest.mark.asyncio +@pytest.mark.parametrize("rebuild_group", [False]) +@pytest.mark.parametrize("num_trainer, num_rollout", [(2, 6)]) +async def test_hccl_checkpoint_engine( + rebuild_group, + num_trainer, + num_rollout, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=True, + model_path="~/models/Qwen/Qwen3-8B-Base", +): + model_path = os.path.expanduser(model_path) + ray.init( + runtime_env={ + "env_vars": { + "HCCL_CONNECT_TIMEOUT": "1500", + "HCCL_HOST_SOCKET_PORT_RANGE": "60000-60050", + "HCCL_NPU_SOCKET_PORT_RANGE": "61000-61050", + "VERL_LOGGING_LEVEL": "DEBUG", + } + } + ) + + # initialize config + checkpoint_engine_config = CheckpointEngineConfig( + backend="hccl", engine_kwargs={"hccl": {"rebuild_group": rebuild_group}} + ) 
+ model_config = HFModelConfig(path=model_path, use_remove_padding=True) + rollout_config = RolloutConfig(name="vllm", checkpoint_engine=checkpoint_engine_config) + + # create trainer and rollout worker group + resource_pool = RayResourcePool(process_on_nodes=[num_gpus_per_node] * num_nodes, max_colocate_count=3) + resource_pool.get_placement_groups(device_name=get_device_name()) + trainer_pool, rollout_pool = split_resource_pool(resource_pool, [num_trainer, num_rollout]) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + rollout, replicas = await create_rollout_worker_group(rollout_pool, model_config, rollout_config, check_allclose) + + # create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager(backend="hccl", trainer=trainer, replicas=replicas) + for _ in range(3): + await checkpoint_manager.update_weights() + rollout.check_weights() + + ray.shutdown() + + +if __name__ == "__main__": + test_hccl_checkpoint_engine( + rebuild_group=False, + num_trainer=2, + num_rollout=6, + num_nodes=1, + num_gpus_per_node=8, + check_allclose=False, + model_path=os.environ["HDFS_ROOT"] + "/model/Qwen3-30B-A3B-Base", + ) diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..193a9eaeb56035752bf82381770af1ecf63098a6 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_special_server_adapter.py @@ -0,0 +1,121 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import os + +import pytest +import ray +from omegaconf import DictConfig +from openai import AsyncOpenAI + +from tests.checkpoint_engine.test_utils import create_trainer_worker_group +from verl.checkpoint_engine import CheckpointEngineManager, CheckpointEngineWorker +from verl.single_controller.ray import ( + RayClassWithInitArgs, + RayResourcePool, + RayWorkerGroup, +) +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.device import get_device_name +from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig +from verl.workers.rollout.replica import get_rollout_replica_class + + +@pytest.fixture +def init_config() -> DictConfig: + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = 8 + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = os.path.expanduser("~/models/Qwen/Qwen3-VL-2B-Instruct") + config.actor_rollout_ref.rollout.name = os.environ["ROLLOUT_NAME"] + config.actor_rollout_ref.rollout.skip_tokenizer_init = False + config.actor_rollout_ref.rollout.max_num_seqs = 256 + config.actor_rollout_ref.rollout.checkpoint_engine.backend = "nccl" if get_device_name() == "cuda" else "hccl" + + return config + + +@pytest.mark.asyncio +async def test_server_adapter(init_config): + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + 
"VLLM_USE_V1": "1", + "VLLM_DISABLE_COMPILE_CACHE": "1", + } + } + ) + + # 1. create trainer worker group + model_config: HFModelConfig = omega_conf_to_dataclass(init_config.actor_rollout_ref.model) + checkpoint_engine_config: CheckpointEngineConfig = omega_conf_to_dataclass( + init_config.actor_rollout_ref.rollout.checkpoint_engine + ) + trainer_pool = RayResourcePool(process_on_nodes=[4], max_colocate_count=3) + trainer = create_trainer_worker_group(trainer_pool, model_config, checkpoint_engine_config) + trainer.reset() + + # 2. create rollout replicas + rollout_config: RolloutConfig = omega_conf_to_dataclass(init_config.actor_rollout_ref.rollout) + + # 2.1 create checkpoint engine worker group + rollout_pool = RayResourcePool(process_on_nodes=[4], max_colocate_count=3) + ray_cls_with_init = RayClassWithInitArgs( + cls=ray.remote(CheckpointEngineWorker), + model_config=model_config, + rollout_config=rollout_config, + ) + rollout = RayWorkerGroup( + resource_pool=rollout_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name() + ) + + # 2.2 create rollout replicas + rollout_replica_class = get_rollout_replica_class(rollout_config.name) + rollout_replicas = [ + rollout_replica_class( + replica_rank=replica_rank, + config=rollout_config, + model_config=model_config, + ) + for replica_rank in range(2) + ] + await asyncio.gather(*[replica.init_hybrid(rollout) for replica in rollout_replicas]) + + # 3. 
create checkpoint engine manager + checkpoint_manager = CheckpointEngineManager( + backend=checkpoint_engine_config.backend, trainer=trainer, replicas=rollout_replicas + ) + for i in range(3): + await checkpoint_manager.update_weights() + + server_addresses = rollout_replicas[i % len(rollout_replicas)].server_address + client = AsyncOpenAI( + api_key="123-abc", + base_url=f"http://{server_addresses}/v1", + ) + + completion = await client.chat.completions.create( + model=init_config.actor_rollout_ref.model.path, + messages=[{"role": "user", "content": "What can you do?"}], + ) + print("[OUTPUT]:", completion.choices[0].message.content) + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..02e3c8f1031df0578fb7459a33d785ff8b2dbdf5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/checkpoint_engine/test_utils.py @@ -0,0 +1,179 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +from typing import Generator + +import ray +import torch +from transformers import AutoModelForCausalLM + +from verl.checkpoint_engine import CheckpointEngineRegistry, CheckpointEngineWorker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name +from verl.utils.fs import copy_to_local +from verl.workers.config import CheckpointEngineConfig, FSDPEngineConfig, HFModelConfig, RolloutConfig +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.rollout import BaseRollout, RolloutReplica + + +class TrainingWorkerTest(TrainingWorker): + def __init__(self, config: TrainingWorkerConfig, checkpoint_engine_config: CheckpointEngineConfig) -> None: + super().__init__(config) + backend = checkpoint_engine_config.backend + bucket_size = checkpoint_engine_config.update_weights_bucket_megabytes << 20 + engine_kwargs = checkpoint_engine_config.engine_kwargs.get(backend, {}) + self.checkpoint_engine = CheckpointEngineRegistry.new( + backend, is_master=(torch.distributed.get_rank() == 0), bucket_size=bucket_size, **engine_kwargs + ) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) + async def update_weights(self): + per_tensor_param, _ = self.engine.get_per_tensor_param() + await self.checkpoint_engine.send_weights(per_tensor_param) + + @register(dispatch_mode=Dispatch.DP_COMPUTE, blocking=False) + def execute_checkpoint_engine(self, method: str, *args, **kwargs): + return getattr(self.checkpoint_engine, method)(*args, **kwargs) + + +class MockServerAdapter(BaseRollout): + def __init__(self, config: RolloutConfig, model_config: HFModelConfig, check_allclose: bool = True): + super().__init__(config, model_config, device_mesh=None) + self.check_allclose = check_allclose + self.model = None + self.received_weights: dict[str, torch.Tensor] = {} + + async def 
resume(self, tags: list[str]): + raise NotImplementedError() + + async def release(self): + raise NotImplementedError() + + async def update_weights( + self, + weights: Generator[tuple[str, torch.Tensor], None, None], + **kwargs, + ): + async for name, weight in weights: + weight = weight.clone() + if self.check_allclose: + self.received_weights[name] = weight.clone() + + def check_weights(self): + if not self.check_allclose: + return + + if self.model is None: + local_path = copy_to_local(self.model_config.path) + self.model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype=torch.bfloat16, device_map="cpu") + + for name, weight in self.model.state_dict().items(): + assert name in self.received_weights, f"weight {name} not received" + received = self.received_weights[name] + assert torch.allclose(weight.to(received.device), received), f"weight {name} not equal" + self.received_weights.clear() + + +class MockReplica(RolloutReplica): + async def init_hybrid(self, worker_group: RayWorkerGroup): + """Init hybrid rollout server, rollout engine and training engine(fsdp/megatron) fused in same process. + + Args: + worker_group: RayWorkerGroup, fused workers where training engine(fsdp/megatron) have been initialized. 
+ """ + self.workers = worker_group.workers[ + self.world_size * self.replica_rank : self.world_size * (self.replica_rank + 1) + ] + + def get_ray_class_with_init_args(self) -> RayClassWithInitArgs: + """Get rollout worker actor class for colocated and standalone mode.""" + raise NotImplementedError + + async def launch_servers(self): + """Launch http server in each node.""" + raise NotImplementedError + + +class CheckpointEngineWorkerTest(CheckpointEngineWorker): + def __init__(self, rollout_config: RolloutConfig, model_config: HFModelConfig, check_allclose: bool = True) -> None: + server_adapter = MockServerAdapter(rollout_config, model_config, check_allclose) + super().__init__(rollout_config, model_config, server_adapter) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def check_weights(self): + self.server_adapter.check_weights() + + +def create_trainer_worker_group( + resource_pool: RayResourcePool, model_config: HFModelConfig, checkpoint_engine_config: CheckpointEngineConfig +) -> RayWorkerGroup: + engine_config = FSDPEngineConfig(forward_only=True, fsdp_size=resource_pool.world_size, strategy="fsdp") + trainer_config = TrainingWorkerConfig( + model_type="language_model", + model_config=model_config, + engine_config=engine_config, + ) + + ray_cls_with_init = RayClassWithInitArgs( + cls=ray.remote(TrainingWorkerTest), + config=trainer_config, + checkpoint_engine_config=checkpoint_engine_config, + ) + ray_cls_with_init.update_options( + { + "runtime_env": { + "env_vars": { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + } + } + } + ) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name()) + return wg + + +async def create_rollout_worker_group( + resource_pool: RayResourcePool, + model_config: HFModelConfig, + rollout_config: RolloutConfig, + check_allclose: bool = True, +) -> tuple[RayWorkerGroup, list[MockReplica]]: + # create rollout worker group + ray_cls_with_init = 
RayClassWithInitArgs( + cls=ray.remote(CheckpointEngineWorkerTest), + model_config=model_config, + rollout_config=rollout_config, + check_allclose=check_allclose, + ) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, device_name=get_device_name()) + + # create rollout replicas + rollout_world_size = ( + rollout_config.tensor_model_parallel_size + * rollout_config.data_parallel_size + * rollout_config.pipeline_model_parallel_size + ) + num_replicas = wg.world_size // rollout_world_size + replicas = [] + for replica_rank in range(num_replicas): + replica = MockReplica( + replica_rank=replica_rank, + config=rollout_config, + model_config=model_config, + ) + replicas.append(replica) + await asyncio.gather(*[replica.init_hybrid(wg) for replica in replicas]) + + return wg, replicas diff --git a/code/RL_model/verl/verl_train/tests/models/test_engine.py b/code/RL_model/verl/verl_train/tests/models/test_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..9878ece4d067da42c14ead4c5af46b992fc561e7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_engine.py @@ -0,0 +1,442 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +os.environ["NCCL_DEBUG"] = "WARN" + +from functools import partial + +import numpy as np +import pytest +import ray +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + AutoTokenizer, + Qwen3Config, + Qwen3MoeConfig, +) + +from verl import DataProto +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.torch_functional import logprobs_from_logits_naive +from verl.workers.config import ( + ActorConfig, + CriticConfig, + FSDPEngineConfig, + FSDPOptimizerConfig, + HFModelConfig, + McoreEngineConfig, + McoreOptimizerConfig, +) +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.utils.losses import ppo_loss, sft_loss, value_loss +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding + + +def get_test_language_model(device_count): + if device_count == 1: + model = "~/models/HuggingFaceTB/SmolLM2-135M-Instruct" + else: + model = "~/models/Qwen/Qwen2.5-0.5B" + model = os.path.expanduser(model) + return model + + +def create_training_config(model_type, strategy, device_count, model): + if device_count == 1: + tp = pp = cp = fsdp_size = 1 + else: + tp = pp = cp = 2 + fsdp_size = 4 + + path = os.path.expanduser(model) + model_config = HFModelConfig(path=path, use_remove_padding=True) + + kwargs = dict( + param_offload=True, + optimizer_offload=True, + grad_offload=True, + use_dynamic_bsz=True, + use_remove_padding=True, + max_token_len_per_gpu=500, + infer_max_token_len_per_gpu=1000, + ) + + if strategy == "megatron": + engine_config = McoreEngineConfig( + forward_only=False, + use_mbridge=True, + 
tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + **kwargs, + ) + optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) + elif strategy in ["fsdp", "fsdp2"]: + engine_config = FSDPEngineConfig( + forward_only=False, fsdp_size=fsdp_size, strategy=strategy, ulysses_sequence_parallel_size=cp, **kwargs + ) + optimizer_config = FSDPOptimizerConfig() + else: + raise NotImplementedError(f"strategy {strategy} is not supported") + + config = TrainingWorkerConfig( + model_type=model_type, + model_config=model_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=None, + ) + return config + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2", "megatron"]) +def test_actor_engine(strategy): + ray.init() + device_count = torch.cuda.device_count() + config = create_training_config( + model_type="language_model", + strategy=strategy, + device_count=device_count, + model=get_test_language_model(device_count), + ) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) + # init model + wg.reset() + + sft_loss_ = partial(sft_loss, config=config) + + wg.set_loss_fn(sft_loss_) + + batch_size = 8 + seqlen = 32 + + response_length = seqlen // 2 + + torch.manual_seed(1) + np.random.seed(1) + + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + global_token_num = torch.sum(attention_mask, dim=-1).tolist() + + print(input_ids.float().mean(), attention_mask.float().mean()) + + responses = input_ids[:, response_length:] + response_mask = 
attention_mask[:, response_length:] + + assert torch.all(response_mask[:, 0] == 1) + + data = DataProto.from_single_dict( + { + "input_ids": input_ids, + "prompts": input_ids[:, :response_length], + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + }, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, + ) + + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # eval + output = wg.infer_batch(data_td) + output = output.get() + logprobs_unpad = tu.get(output, "log_probs").cpu() + logprobs = no_padding_2_padding(logprobs_unpad, data_td) + + output = DataProto.from_single_dict({"old_log_probs": logprobs}) + + # load hf model and compare results with hf model + path = config.model_config.path + hf_model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16) + hf_output = hf_model(input_ids, attention_mask=attention_mask) + hf_logprobs = logprobs_from_logits_naive( + hf_output.logits[:, -response_length - 1 : -1, :].float(), input_ids[:, -response_length:] + ) + hf_logprobs_mean = torch.mean(hf_logprobs * response_mask) + mcore_logprobs_mean = torch.mean(output.batch["old_log_probs"] * response_mask) + + torch.testing.assert_close(hf_logprobs_mean, mcore_logprobs_mean, atol=1e-3, rtol=1e-2) + + data = data.union(output) + + # TODO: sft_loss_ is not compatible with ActorWorker until we replace DataProto with torch.jagged TensorDict + # wg.set_loss_fn(sft_loss_) + + # train for one step + # metrics = wg.update_actor(data) + # print(metrics) + + # add ppo data + data.batch["advantages"] = torch.rand_like(responses, dtype=torch.float32) + data.batch["ref_log_prob"] = torch.rand_like(responses, dtype=torch.float32) + + # construct actor config + actor_config = ActorConfig(strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1) + + # set ppo loss + ppo_loss_ = partial(ppo_loss, config=actor_config) + 
wg.set_loss_fn(ppo_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + + # test manual load/offload + tu.assign_non_tensor(data_td, disable_auto_offload=True) + wg.to("device") + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + wg.to("cpu") + + ray.shutdown() + + +def create_value_model(language_model_path, output_path): + config = AutoConfig.from_pretrained(language_model_path) + config.num_labels = 1 + config.classifier_dropout = 0 + config.tie_word_embeddings = False + model = AutoModelForTokenClassification.from_config(config) + tokenizer = AutoTokenizer.from_pretrained(os.path.expanduser(language_model_path)) + assert model.config.num_labels == 1 + path = os.path.expanduser(output_path) + model.save_pretrained(path) + tokenizer.save_pretrained(path) + config.save_pretrained(path) + return path + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2"]) +def test_critic_engine(strategy): + device_count = torch.cuda.device_count() + value_model_path = os.path.expanduser("~/models/test_model") + language_model_path = get_test_language_model(device_count=device_count) + create_value_model(language_model_path, value_model_path) + + torch.manual_seed(1) + np.random.seed(1) + + ray.init() + + config = create_training_config( + model_type="value_model", strategy=strategy, device_count=device_count, model=value_model_path + ) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) + # init model + wg.reset() 
+ + batch_size = 8 + seqlen = 32 + + response_length = seqlen // 2 + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + global_token_num = torch.sum(attention_mask, dim=-1).tolist() + + print(input_ids.float().mean(), attention_mask.float().mean()) + + responses = input_ids[:, response_length:] + response_mask = attention_mask[:, response_length:] + + assert torch.all(response_mask[:, 0] == 1) + + data = DataProto.from_single_dict( + { + "input_ids": input_ids, + "prompts": input_ids[:, :response_length], + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + }, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, + ) + + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # eval + output = wg.infer_batch(data_td) + output = output.get() + + values_unpad = tu.get(output, "values").float().cpu() + values = no_padding_2_padding(values_unpad, data_td) + + output = DataProto.from_single_dict({"values": values}) + + # load hf model and compare results with hf model + with torch.device("cuda"), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + hf_model = AutoModelForTokenClassification.from_pretrained( + value_model_path, torch_dtype=torch.float32, attn_implementation="flash_attention_2" + ) + hf_output = hf_model(input_ids.cuda(), attention_mask=attention_mask.cuda()) + hf_values = hf_output.logits[:, -response_length - 1 : -1, :].float().squeeze(-1).cpu() + + hf_values_mean = torch.mean(hf_values * response_mask) + engine_values = torch.mean(output.batch["values"] * response_mask) + + torch.testing.assert_close(hf_values_mean, engine_values, atol=1e-2, 
rtol=1e-2) + + data = data.union(output) + + # add ppo data + data.batch["returns"] = torch.rand_like(responses, dtype=torch.float32) + + # update again + # create critic config + critic_config = CriticConfig( + strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model_config=config.model_config + ) + value_loss_ = partial(value_loss, config=critic_config) + wg.set_loss_fn(value_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + + ray.shutdown() + + +def create_actor_model(tmp_path, config): + model = AutoModelForCausalLM.from_config(config) + path = os.path.join(tmp_path, "test_model") + model.save_pretrained(path) + config.save_pretrained(path) + return path + + +def _worker(rank: int, world_size: int, rendezvous_file: str, strategy: str, model_path: str): + torch.cuda.set_device(rank) + dist.init_process_group( + backend="nccl", + init_method=f"file://{rendezvous_file}", + rank=rank, + world_size=world_size, + ) + + ref_model_config = AutoConfig.from_pretrained(model_path) + with torch.device("meta"): + ref_model = AutoModelForCausalLM.from_config(ref_model_config) + + from verl.workers.engine import BaseEngine, EngineRegistry + + # construct configs + model_config = HFModelConfig(path=model_path, load_tokenizer=False) + + if strategy == "megatron": + engine_config = McoreEngineConfig( + forward_only=False, + use_mbridge=True, + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, + context_parallel_size=1, + ) + optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) + elif strategy in ["fsdp", "fsdp2"]: + engine_config = FSDPEngineConfig( + forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 + ) + optimizer_config = 
FSDPOptimizerConfig() + else: + raise NotImplementedError(f"strategy {strategy} is not supported") + + checkpoint_config = CheckpointConfig() + + # build model engine + engine: BaseEngine = EngineRegistry.new( + model_type="language_model", + backend=engine_config.strategy, + model_config=model_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=checkpoint_config, + ) + + engine.initialize() + + # get per tensor parameter + per_tensor_params, _ = engine.get_per_tensor_param() + + ref_state_dict = ref_model.state_dict() + + # load ground truth and compare + for key, value in per_tensor_params: + assert key in ref_state_dict, f"{key} not in ref_state_dict" + assert value.shape == ref_state_dict[key].shape, ( + f"{key} shape not equal, {value.shape} != {ref_state_dict[key].shape}" + ) + if rank == 0: + print(key, value.shape) + + dist.barrier() + dist.destroy_process_group() + + +@pytest.mark.parametrize("world_size", [8]) +@pytest.mark.parametrize("config", [Qwen3Config(num_hidden_layers=2), Qwen3MoeConfig(num_hidden_layers=2)]) +@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) +def test_per_tensor_generator(world_size, tmp_path, config, strategy): + rendezvous_file = str(tmp_path / "rdzv_mask") + os.makedirs(os.path.dirname(rendezvous_file), exist_ok=True) + # create a model + model_path = create_actor_model(tmp_path, config) + # spawn workers + mp.spawn( + fn=_worker, + args=(world_size, rendezvous_file, strategy, model_path), + nprocs=world_size, + join=True, + ) diff --git a/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py b/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..6b022243ffe4ba15724fcf2c89f91a92e0b1e37c --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_tiled_mlp_accuracy.py @@ -0,0 +1,218 @@ +# Copyright 2025 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test script to verify TiledMLP accuracy by comparing logits and gradients +between regular MLP and TiledMLP under FSDP2. +Run with: torchrun --nproc_per_node=2 tests/test_tiled_mlp_accuracy.py +""" + +import torch +import torch.distributed as dist +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import fully_shard + + +def setup_distributed(): + dist.init_process_group(backend="nccl") + rank = dist.get_rank() + world_size = dist.get_world_size() + torch.cuda.set_device(rank) + return rank, world_size + + +def create_model(model_name="Qwen/Qwen3-1.7B", num_layers=2): + """Load a Qwen3-1.7B model with only 2 layers from pretrained weights.""" + from transformers import AutoConfig, AutoModelForCausalLM + + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + config.num_hidden_layers = num_layers + + model = AutoModelForCausalLM.from_pretrained( + model_name, + config=config, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation="flash_attention_2", + ) + return model + + +def apply_fsdp2(model, device_mesh): + """Apply FSDP2 sharding to model.""" + for layer in model.model.layers: + fully_shard(layer, mesh=device_mesh) + fully_shard(model, mesh=device_mesh) + return model + + +def run_forward_backward(model, input_ids, labels): + """Run forward and backward pass, return logits and gradients.""" + model.zero_grad() 
+ + outputs = model(input_ids=input_ids, labels=labels) + logits = outputs.logits.clone().detach() + loss = outputs.loss + + loss.backward() + + # Collect MLP gradients + gradients = {} + for name, param in model.named_parameters(): + if "mlp" in name and param.grad is not None: + gradients[name] = param.grad.clone().detach() + + return logits, gradients, loss.item() + + +def compare_results(logits1, grads1, logits2, grads2, rank): + """Compare logits and gradients between two runs.""" + # Compare logits + logits_diff = (logits1 - logits2).abs() + logits_max_diff = logits_diff.max().item() + logits_mean_diff = logits_diff.mean().item() + + # Compare gradients (only for params that exist on this rank due to FSDP sharding) + all_pass = True + grad_results = [] + for name in sorted(grads1.keys()): + if name in grads2: + g1, g2 = grads1[name], grads2[name] + diff = (g1 - g2).abs() + max_diff = diff.max().item() + mean_diff = diff.mean().item() + + # Check if within tolerance (1e-2 for bf16) + passed = max_diff < 1e-2 + if not passed: + all_pass = False + grad_results.append((name, max_diff, mean_diff, passed)) + + # Only print on rank 0 to avoid duplicate output + if rank == 0: + print("\n=== Comparison Results ===") + print("\nLogits:") + print(f" Max diff: {logits_max_diff:.2e}") + print(f" Mean diff: {logits_mean_diff:.2e}") + + print("\nMLP Parameter Gradients:") + if grad_results: + for name, max_diff, mean_diff, passed in grad_results: + status = "✓" if passed else "✗" + print(f" {name}: max={max_diff:.2e}, mean={mean_diff:.2e} {status}") + else: + print(" (Gradients sharded to other ranks under FSDP2)") + + return all_pass + + +def main(): + rank, world_size = setup_distributed() + device_mesh = init_device_mesh("cuda", (world_size,)) + + model_name = "Qwen/Qwen3-1.7B" + num_layers = 2 + + if rank == 0: + print(f"Running TiledMLP accuracy test with {world_size} GPUs") + print(f"Model: {model_name} ({num_layers} layers, from pretrained)") + + dist.barrier() + + # 
========== Create Model 1: WITHOUT TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Creating Model 1 (without TiledMLP)") + print("=" * 60) + + model1 = create_model(model_name, num_layers) + model1 = apply_fsdp2(model1, device_mesh) + model1 = model1.cuda() + + # Create deterministic input + torch.manual_seed(42) + batch_size, seq_len = 2, 256 + vocab_size = model1.config.vocab_size + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + labels = input_ids.clone() + + # ========== Run Model 1: WITHOUT TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Running forward/backward on Model 1 (without TiledMLP)") + print("=" * 60) + + logits1, grads1, loss1 = run_forward_backward(model1, input_ids, labels) + if rank == 0: + print(f"Loss: {loss1:.4f}") + + # Free model1 memory before creating model2 + del model1 + torch.cuda.empty_cache() + + dist.barrier() + + # ========== Create Model 2, apply TiledMLP patch, then FSDP2 ========== + if rank == 0: + print("\n" + "=" * 60) + print("Creating Model 2 (with TiledMLP, patch before FSDP2)") + print("=" * 60) + + model2 = create_model(model_name, num_layers) + + # Apply TiledMLP patch AFTER model instantiation but BEFORE FSDP2 wrap + if rank == 0: + print("Applying TiledMLP monkey patch before FSDP2...") + + from verl.models.transformers.tiled_mlp import apply_tiled_mlp_monkey_patch + + apply_tiled_mlp_monkey_patch(num_shards=4, model_type="qwen3") + + model2 = apply_fsdp2(model2, device_mesh) + model2 = model2.cuda() + + dist.barrier() + + # ========== Run Model 2: WITH TiledMLP ========== + if rank == 0: + print("\n" + "=" * 60) + print("Running forward/backward on Model 2 (with TiledMLP)") + print("=" * 60) + + logits2, grads2, loss2 = run_forward_backward(model2, input_ids, labels) + if rank == 0: + print(f"Loss: {loss2:.4f}") + + dist.barrier() + + # ========== Compare Results ========== + all_pass = compare_results(logits1, grads1, logits2, grads2, rank) 
+ + dist.barrier() + + if rank == 0: + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Loss diff: {abs(loss1 - loss2):.2e}") + print(f"All gradient checks: {'PASS' if all_pass else 'FAIL'}") + + # Cleanup + del model2 + torch.cuda.empty_cache() + + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/models/test_transformer.py b/code/RL_model/verl/verl_train/tests/models/test_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd085497a16cd73e828bff596dd888d054827af --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_transformer.py @@ -0,0 +1,239 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from transformers import ( + ApertusConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + GemmaConfig, + LlamaConfig, + MistralConfig, + Qwen2Config, +) + +from verl.utils.device import get_device_name + +if get_device_name() == "cuda": + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input +elif get_device_name() == "npu": + from verl.utils.attention_utils import index_first_axis, pad_input, rearrange, unpad_input + +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.torch_functional import log_probs_from_logits_all_rmpad, masked_mean + +# TODO(sgm): add more models for test +# we only need one scale for each model +test_configs = [ + LlamaConfig(num_hidden_layers=1), + MistralConfig(num_hidden_layers=1), + GemmaConfig(num_hidden_layers=1), + Qwen2Config(num_hidden_layers=1), + ApertusConfig(num_hidden_layers=1), +] + + +def test_hf_casual_models(): + batch_size = 4 + seqlen = 128 + response_length = 127 + + for config in test_configs: + # config = AutoConfig.from_pretrained(test_case) + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, + max_ratio_of_left_padding=0.1, + max_ratio_of_valid_token=0.8, + min_ratio_of_valid_token=0.5, + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) 
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_rmpad = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + origin_logits = model( + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, use_cache=False + ).logits + origin_logits_rmpad, origin_logits_indices, *_ = unpad_input(origin_logits, attention_mask) + + logits_rmpad = logits_rmpad.squeeze(0) + log_probs = log_probs_from_logits_all_rmpad( + input_ids_rmpad=input_ids_rmpad, + logits_rmpad=logits_rmpad, + indices=indices, + batch_size=batch_size, + seqlen=seqlen, + response_length=response_length, + ) # (batch, seqlen) + origin_log_probs = log_probs_from_logits_all_rmpad( + input_ids_rmpad=input_ids_rmpad, + logits_rmpad=origin_logits_rmpad, + indices=origin_logits_indices, + batch_size=batch_size, + seqlen=seqlen, + response_length=response_length, + ) # (batch, seqlen) + + torch.testing.assert_close( + masked_mean(log_probs, attention_mask[:, -response_length - 1 : -1]), + masked_mean(origin_log_probs, attention_mask[:, -response_length - 1 : -1]), + atol=1e-2, + rtol=1e-5, + ) + print("Check pass") + + +def test_hf_value_models(): + batch_size = 4 + seqlen = 128 + + for config in test_configs: + # config = AutoConfig.from_pretrained(test_case) + config.num_labels = 1 + config.classifier_dropout = 0 + config.hidden_dropout = 0 + with torch.device(get_device_name()): + model = AutoModelForTokenClassification.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, 
seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, + max_ratio_of_left_padding=0.1, + max_ratio_of_valid_token=0.8, + min_ratio_of_valid_token=0.5, + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + origin_logits = model( + input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, use_cache=False + ).logits + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + rmpad_logits = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, 1) + rmpad_logits = rmpad_logits.squeeze(0) + pad_logits = pad_input(rmpad_logits, indices, batch_size, seqlen=seqlen) + + torch.testing.assert_close( + masked_mean(pad_logits, attention_mask[:, :, None]), + masked_mean(origin_logits, attention_mask[:, :, None]), + atol=1e-2, + rtol=1e-5, + ) + print("Value model check pass") + + +def test_attn_implementation_override(): + """Test that attn_implementation override config is properly respected.""" + # Test case 1: Test the actual extraction logic (no network required) + test_cases = [ + ({}, "flash_attention_2"), # Default case + ({"attn_implementation": "eager"}, "eager"), # Override case + ({"attn_implementation": "sdpa"}, "sdpa"), # Another override + ({"other_config": "value"}, "flash_attention_2"), # No attn_implementation key + ] + + for override_config, expected in test_cases: + actual = override_config.get("attn_implementation", "flash_attention_2") + assert actual == expected, 
f"Expected {expected}, got {actual} for config {override_config}" + + # Test case 2: Test with local config creation (simulate FSDP worker behavior) + # Test default behavior + override_config_default = {} + attn_implementation_default = override_config_default.get("attn_implementation", "flash_attention_2") + assert attn_implementation_default == "flash_attention_2" + + # Test override behavior + override_config_eager = {"attn_implementation": "eager"} + attn_implementation_eager = override_config_eager.get("attn_implementation", "flash_attention_2") + assert attn_implementation_eager == "eager" + + # Test that we can create a config with specific attn_implementation + config_with_eager = LlamaConfig(num_hidden_layers=1, _attn_implementation="eager") + assert config_with_eager._attn_implementation == "eager" + + config_with_flash = LlamaConfig(num_hidden_layers=1, _attn_implementation="flash_attention_2") + assert config_with_flash._attn_implementation == "flash_attention_2" + + print("✓ All attn_implementation override config tests passed") + + +def test_fsdp_worker_attn_implementation_integration(): + """Test integration of attn_implementation with FSDP worker logic.""" + + # Mock the FSDP worker configuration scenario + mock_override_config = {"attn_implementation": "eager"} + + # Test the exact logic used in FSDP workers + attn_implementation = mock_override_config.get("attn_implementation", "flash_attention_2") + assert attn_implementation == "eager" + + # Test with empty config (should default) + mock_override_config_empty = {} + attn_implementation_default = mock_override_config_empty.get("attn_implementation", "flash_attention_2") + assert attn_implementation_default == "flash_attention_2" + + # Test that the parameter would be passed correctly to both AutoConfig and Model + expected_calls = [ + ("AutoConfig.from_pretrained", {"attn_implementation": attn_implementation}), + ("AutoModel.from_pretrained", {"attn_implementation": attn_implementation}), + ] + 
+ # Verify the parameter extraction works as expected + for call_name, expected_params in expected_calls: + assert expected_params["attn_implementation"] == "eager" + + print("✓ FSDP worker integration test passed") + + +if __name__ == "__main__": + test_hf_casual_models() + test_hf_value_models() + test_attn_implementation_override() + test_fsdp_worker_attn_implementation_integration() diff --git a/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py b/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py new file mode 100644 index 0000000000000000000000000000000000000000..b3387927885f00cb928312bd955ab1210a067e6b --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/models/test_transformers_ulysses.py @@ -0,0 +1,283 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import contextlib +import copy +from dataclasses import dataclass + +import pytest +import torch +import torch.distributed +import transformers +from packaging import version +from torch.distributed import init_device_mesh +from transformers import AutoModelForCausalLM, LlamaConfig, PretrainedConfig, Qwen2Config + +from verl.models.transformers.monkey_patch import apply_monkey_patch +from verl.protocol import DataProto +from verl.utils.device import get_device_name, get_torch_device +from verl.utils.distributed import initialize_global_process_group +from verl.utils.model import compute_position_id_with_mask, create_random_mask +from verl.utils.ulysses import ( + gather_outputs_and_unpad, + get_ulysses_sequence_parallel_world_size, + set_ulysses_sequence_parallel_group, + ulysses_pad_and_slice_inputs, +) +from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager + +if get_device_name() == "cuda": + from flash_attn.bert_padding import index_first_axis, rearrange, unpad_input +elif get_device_name() == "npu": + from verl.utils.attention_utils import index_first_axis, rearrange, unpad_input + +# TODO(sgm): add more models for test +# we only need one scale for each model + + +@dataclass +class SequenceParallelConfig: + config: PretrainedConfig + sp_size: int + is_valid: bool + + +def test_configs(): + configs = [ + SequenceParallelConfig( + LlamaConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32), sp_size=8, is_valid=True + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=28, num_key_value_heads=4, hidden_size=3584), + sp_size=4, + is_valid=True, + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=28, num_key_value_heads=4, hidden_size=3584), + sp_size=8, + is_valid=False, + ), + SequenceParallelConfig( + Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4), sp_size=4, is_valid=True + ), + SequenceParallelConfig( + 
Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4), sp_size=8, is_valid=True + ), + ] + + if version.parse(transformers.__version__) >= version.parse("4.56.0"): + from transformers import ApertusConfig + + configs.append( + SequenceParallelConfig( + ApertusConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32, hidden_size=4096), + sp_size=8, + is_valid=True, + ) + ) + + return configs + + +def sync_model_parameters_global(layer): + # synchronize weights + for p in layer.parameters(): + torch.distributed.broadcast(tensor=p.data, src=0) + + +@pytest.mark.parametrize("test_config", test_configs()) +def test_hf_casual_fwd_bwd(test_config): + if not torch.distributed.is_initialized(): + initialize_global_process_group() + + context = contextlib.nullcontext() if test_config.is_valid else pytest.raises(AssertionError) + with context: + world_size = torch.distributed.get_world_size() + _hf_casual_fwd_bwd(test_config.config, test_config.sp_size, world_size // test_config.sp_size) + + # TODO: seems not work, will cause `socketStartConnect: Connect to xxx failed : Software caused connection abort` + # torch.distributed.destroy_process_group() + + +def _hf_casual_fwd(config, sp_size, dp_size): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus for test" + + ulysses_device_mesh = init_device_mesh( + device_type=get_device_name(), mesh_shape=(dp_size, sp_size), mesh_dim_names=("dp", "sp") + ) + sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh) + + batch_size = 1 + seqlen = 128 + # response_length = 127 + + # patch before load + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + apply_monkey_patch(model, sp_size) + model = model.to(device=get_device_name()) + sync_model_parameters_global(model) + + # different rank will generate different input_ids following fsdp + input_ids 
= torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_left_padding=0, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.8 + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + model_inputs = { + "input_ids": input_ids.to(get_device_name()), + "attention_mask": attention_mask.to(get_device_name()), + "position_ids": position_ids.int().to(get_device_name()), + } + + model_inputs = DataProto.from_dict(model_inputs) + + # 1. perform ulysses forward + with sharding_manager: + model_inputs = sharding_manager.preprocess_data(model_inputs) + input_ids = model_inputs.batch["input_ids"] + attention_mask = model_inputs.batch["attention_mask"] + position_ids = model_inputs.batch["position_ids"] + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # slice input tensor for ulysses + # input_ids are padded and sliced + # postition_ids are only padded but not sliced + input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size() + ) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_split_in_seq = model( + input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded, use_cache=False + ).logits # (1, total_nnz/n, vocab_size) + + # all_gather output + logits_full = gather_outputs_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size) + + # 2. 
perform normal forward + set_ulysses_sequence_parallel_group(None) + logits_rmpad_local = model( + input_ids_rmpad, position_ids=position_ids_rmpad, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + mean_local = logits_rmpad_local.mean() + mean_full = logits_full.mean() + torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5) + + +def _hf_casual_fwd_bwd(config, sp_size, dp_size): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus for test" + + ulysses_device_mesh = init_device_mesh( + device_type=get_device_name(), mesh_shape=(dp_size, sp_size), mesh_dim_names=("dp", "sp") + ) + sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh) + + batch_size = 1 + seqlen = 128 + # response_length = 127 + + # patch before load + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + apply_monkey_patch(model, sp_size) + model = model.to(device=get_device_name()) + sync_model_parameters_global(model) + + # different rank will generate different input_ids following fsdp + input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device=get_device_name()) + attention_mask = create_random_mask( + input_ids=input_ids, max_ratio_of_left_padding=0, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.8 + ) + position_ids = compute_position_id_with_mask( + attention_mask + ) # TODO(sgm): we can construct the position_ids_rmpad here + + model_inputs = { + "input_ids": input_ids.to(get_device_name()), + "attention_mask": attention_mask.to(get_device_name()), + "position_ids": position_ids.int().to(get_device_name()), + } + + model_inputs = DataProto.from_dict(model_inputs) + + # 1. 
perform ulysses forward + with sharding_manager: + model_inputs = sharding_manager.preprocess_data(model_inputs) + input_ids = model_inputs.batch["input_ids"] + attention_mask = model_inputs.batch["attention_mask"] + position_ids = model_inputs.batch["position_ids"] + input_ids_rmpad, indices, *_ = unpad_input( + input_ids.unsqueeze(-1), attention_mask + ) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices + ).transpose(0, 1) + + # slice input tensor for ulysses + # input_ids are padded and sliced + # postition_ids are only padded but not sliced + input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size() + ) + + # input with input_ids_rmpad and postition_ids to enable flash attention varlen + logits_split_in_seq = model( + input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded, use_cache=False + ).logits # (1, total_nnz/n, vocab_size) + + # all_gather output + logits_full = gather_outputs_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size) + + # 2. perform normal forward + set_ulysses_sequence_parallel_group(None) + input_ids_full = copy.deepcopy(input_ids_rmpad) + position_ids_full = copy.deepcopy(position_ids_rmpad) + model_no_sp = copy.deepcopy(model) + logits_rmpad_local = model_no_sp( + input_ids_full, position_ids=position_ids_full, use_cache=False + ).logits # (1, total_nnz, vocab_size) + + mean_local = logits_rmpad_local.mean() + mean_full = logits_full.mean() + + mean_full.backward() + mean_local.backward() + + # 3. 
check the gradients + grad = model.model.layers[0].self_attn.q_proj.weight.grad + grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad + torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=3e-5) + # The check should be less strict because the gradient is not an averaged value. + torch.testing.assert_close(grad, grad_full, rtol=1e-2, atol=1e-3) + + +if __name__ == "__main__": + pytest.main([__file__, "-svv"]) diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..2b832da89910d1876fdaed7ad88e02170e5c35c1 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_get_set_dispatch_collect_cpu.py @@ -0,0 +1,47 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest + +from verl.single_controller.base import Worker + + +def test_get_set_dispatch_collect_cpu(): + os.environ["RANK"] = "0" + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = "2" + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12345" + + ref = Worker() + ref._register_dispatch_collect_info(mesh_name="actor", dp_rank=0, is_collect=True) + + actor = Worker() + actor._register_dispatch_collect_info(mesh_name="actor", dp_rank=1, is_collect=False) + + actor_rollout_ref = Worker() + actor_rollout_ref.set_dispatch_collect(mesh_name="ref", **ref.get_dispatch_collect()) + actor_rollout_ref.set_dispatch_collect(mesh_name="actor", **actor.get_dispatch_collect()) + + assert actor_rollout_ref._query_dispatch_info("ref") == 0 + assert actor_rollout_ref._query_collect_info("ref") + assert actor_rollout_ref._query_dispatch_info("actor") == 1 + assert not actor_rollout_ref._query_collect_info("actor") + + # test conflict mesh_name + actor2 = Worker() + actor2._register_dispatch_collect_info(mesh_name="actor", dp_rank=1, is_collect=False) + with pytest.raises(AssertionError): + actor_rollout_ref.set_dispatch_collect(mesh_name="actor", **actor2.get_dispatch_collect()) diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py b/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..99145e5949ee9bf03f85f4201f1e025b42b4e200 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_nested_worker.py @@ -0,0 +1,75 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ray + +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name + + +class TestActor(Worker): + # TODO: pass *args and **kwargs is bug prone and not very convincing + def __init__(self, x) -> None: + super().__init__() + self.a = x + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get(self): + return self.a + self.rank + + +class TestHighLevelActor(Worker): + def __init__(self, x=None) -> None: + super().__init__() + self.test_actor = TestActor(x=x) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get(self): + return self.test_actor.get() + + +def test_nested_worker(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=True) + class_with_args = RayClassWithInitArgs(cls=ray.remote(TestActor), x=2) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic", + device_name=get_device_name(), + ) + + output = worker_group.get() + + assert output == [2, 3, 4, 5] + + class_with_args = RayClassWithInitArgs(cls=ray.remote(TestHighLevelActor), x=2) + high_level_worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic_2", + device_name=get_device_name(), + ) + + output_1 = high_level_worker_group.get() + + assert output_1 == 
[2, 3, 4, 5] + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py new file mode 100644 index 0000000000000000000000000000000000000000..3722a8f8029313bad6070d8d0ed2b9a29e4f3770 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_collectives.py @@ -0,0 +1,113 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test for using ray collective group. +Suppose we Actor and Rollout. Actor contains 4 workers and Rollout contains 2 workers. 
We established a Worker to +Rollout relationship by using collective groups +Actor: rank 0, 1 - Rollout rank 0 +Rollout rank 2, 3 - Rollout rank 1 +Then, we initiate 4 p2p comms from actor to rollout +""" + +import ray +import ray.util.collective as collective +import torch + +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup + + +@ray.remote +class Actor(Worker): + @register(Dispatch.ONE_TO_ALL) + def init(self): + remote_rank = self.rank // 2 + self.group_name = f"A{self.rank}_R{remote_rank}" + collective.init_collective_group(world_size=2, rank=0, backend="nccl", group_name=self.group_name) + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def send_tensors(self): + tensor = torch.ones(size=(4,), dtype=torch.float32, device="cuda") * self.rank + collective.send(tensor=tensor, dst_rank=1, group_name=self.group_name) + + +@ray.remote +class Rollout(Worker): + @register(Dispatch.ONE_TO_ALL) + def init(self): + self.remote_first_rank = self.rank * 2 + self.remote_second_rank = self.remote_first_rank + 1 + self.first_group_name = f"A{self.remote_first_rank}_R{self.rank}" + self.second_group_name = f"A{self.remote_second_rank}_R{self.rank}" + + collective.init_collective_group(world_size=2, rank=1, backend="nccl", group_name=self.first_group_name) + collective.init_collective_group(world_size=2, rank=1, backend="nccl", group_name=self.second_group_name) + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def receive_tensors(self): + self.tensor1 = torch.randn(size=(4,), dtype=torch.float32, device="cuda") + self.tensor2 = torch.randn(size=(4,), dtype=torch.float32, device="cuda") + + collective.recv(self.tensor1, src_rank=0, group_name=self.first_group_name) + collective.recv(self.tensor2, src_rank=0, group_name=self.second_group_name) + + @register(Dispatch.ONE_TO_ALL) + def get_tensors(self): + return 
{f"src_{self.remote_first_rank}": self.tensor1, f"src_{self.remote_second_rank}": self.tensor2} + + +def test_ray_collective_group(): + ray.init() + + actor_resource_pool = RayResourcePool([4]) + rollout_resource_pool = RayResourcePool([2]) + + actor_cls = RayClassWithInitArgs(cls=Actor) + rollout_cls = RayClassWithInitArgs(cls=Rollout) + + actor_wg = RayWorkerGroup( + resource_pool=actor_resource_pool, ray_cls_with_init=actor_cls, name_prefix="collective_group_actor" + ) + rollout_wg = RayWorkerGroup( + resource_pool=rollout_resource_pool, ray_cls_with_init=rollout_cls, name_prefix="collective_group_rollout" + ) + + actor_wg.init() + rollout_wg.init() + + out1 = actor_wg.send_tensors() + out2 = rollout_wg.receive_tensors() + + # block to wait + ray.get(out1) + ray.get(out2) + + output = rollout_wg.get_tensors() + + rollout_0_output = output[0] + rollout_1_output = output[1] + + output = rollout_0_output | rollout_1_output + + print(output) + + for i in range(4): + assert torch.sum(output[f"src_{i}"]).item() == 4 * i + + ray.shutdown() + + +if __name__ == "__main__": + test_ray_collective_group() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..6c51beeaf3f8600387ce14fe63c97a5c804c4237 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_local_envs_on_cpu.py @@ -0,0 +1,91 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +e2e test verl.single_controller.ray +""" + +import os + +import ray + +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup + + +@ray.remote +class TestActor(Worker): + def __init__(self) -> None: + super().__init__() + + def getenv(self, key): + val = os.getenv(key, f"{key} not set") + return val + + +def test_basics(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=False) + class_with_args = RayClassWithInitArgs(cls=TestActor) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, ray_cls_with_init=class_with_args, name_prefix="worker_group_basic" + ) + + output = worker_group.execute_all_sync("getenv", key="RAY_LOCAL_WORLD_SIZE") + assert output == ["4", "4", "4", "4"] + + ray.shutdown() + + +def test_customized_worker_env(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=False) + class_with_args = RayClassWithInitArgs(cls=TestActor) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_customized", + worker_env={ + "test_key": "test_value", # new key will be appended + }, + ) + + output = worker_group.execute_all_sync("getenv", key="test_key") + assert output == ["test_value", "test_value", "test_value", "test_value"] + + try: + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + 
name_prefix="worker_group_error", + worker_env={ + "WORLD_SIZE": "100", # override system env will result in error + }, + ) + except ValueError as e: + assert "WORLD_SIZE" in str(e) + else: + raise ValueError("test failed") + + ray.shutdown() + + +if __name__ == "__main__": + test_basics() + test_customized_worker_env() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..e36497d210f6ec5daa8b9d559987f5dcc3974af2 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_ray_utils_on_cpu.py @@ -0,0 +1,54 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import ray + +from verl.utils.ray_utils import parallel_put + + +# Initialize Ray for testing if not already done globally +@pytest.fixture() +def init_ray(): + ray.init(num_cpus=4) + yield + ray.shutdown() + + +def test_parallel_put_basic(init_ray): + data = [1, "hello", {"a": 2}, [3, 4]] + refs = parallel_put(data) + assert len(refs) == len(data) + retrieved_data = [ray.get(ref) for ref in refs] + assert retrieved_data == data + + +def test_parallel_put_empty(init_ray): + data = [] + with pytest.raises(AssertionError): + _ = parallel_put(data) + + +def test_parallel_put_workers(init_ray): + data = list(range(20)) + # Test with specific number of workers + refs = parallel_put(data, max_workers=4) + assert len(refs) == len(data) + retrieved_data = [ray.get(ref) for ref in refs] + assert retrieved_data == data + # Test with default workers (should cap) + refs_default = parallel_put(data) + assert len(refs_default) == len(data) + retrieved_data_default = [ray.get(ref) for ref in refs_default] + assert retrieved_data_default == data diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py b/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb32606cf36e83bf41fb59154ce72c51928b804 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_split_resource_pool.py @@ -0,0 +1,181 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import ray +import torch + +from verl import DataProto +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import Dispatch, register +from verl.single_controller.ray.base import ( + RayClassWithInitArgs, + RayResourcePool, + RayWorkerGroup, + split_resource_pool, +) +from verl.utils.device import get_device_name, get_nccl_backend + + +@ray.remote +class Actor(Worker): + def __init__(self, worker_id) -> None: + super().__init__() + self.worker_id = worker_id + self.temp_tensor = torch.rand(4096, 4096).to(get_device_name()) + + if not torch.distributed.is_initialized(): + rank = int(os.environ.get("RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + torch.distributed.init_process_group(backend=get_nccl_backend(), world_size=world_size, rank=rank) + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def add(self, data: DataProto): + data.batch["a"] += self.rank + self.worker_id + return data + + +def test_split_resource_pool_with_split_size(): + ray.init() + # assume we have 2 nodes, with 4 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[4, 4]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 4 gpus for actor_1, last 4 gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool(resource_pool=global_resource_pool, split_size=4) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + assert actor_worker_1.world_size == 4 + assert actor_worker_2.world_size == 4 + + data = 
DataProto.from_dict({"a": torch.zeros(8)}) + actor_output_1 = actor_worker_1.add(data) + actor_output_2 = actor_worker_2.add(data) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1, 2, 2, 3, 3] + assert actor_output_2.batch["a"].tolist() == [100, 100, 101, 101, 102, 102, 103, 103] + + ray.shutdown() + + +def test_split_resource_pool_with_split_size_list(): + ray.init() + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[2, 2, 2, 2]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 2 gpus for actor_1, last 6 gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 6], + ) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + assert actor_worker_1.world_size == 2 + assert actor_worker_2.world_size == 6 + + data_1 = DataProto.from_dict({"a": torch.zeros(4)}) + data_2 = DataProto.from_dict({"a": torch.zeros(6)}) + actor_output_1 = actor_worker_1.add(data_1) + actor_output_2 = actor_worker_2.add(data_2) + print(actor_output_1.batch["a"].tolist()) + print(actor_output_2.batch["a"].tolist()) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1] + assert actor_output_2.batch["a"].tolist() == [100, 101, 102, 103, 104, 105] + + ray.shutdown() + + +def test_split_resource_pool_with_split_size_list_cross_nodes(): + ray.init() + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[4, 4]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # first 2 gpus for actor_1, last 6 
gpus for actor_2 + actor_1_resource_pool, actor_2_resource_pool = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 6], + ) + actor_cls_1 = RayClassWithInitArgs(cls=Actor, worker_id=0) + actor_cls_2 = RayClassWithInitArgs(cls=Actor, worker_id=100) + actor_worker_1 = RayWorkerGroup( + resource_pool=actor_1_resource_pool, ray_cls_with_init=actor_cls_1, device_name=get_device_name() + ) + actor_worker_2 = RayWorkerGroup( + resource_pool=actor_2_resource_pool, ray_cls_with_init=actor_cls_2, device_name=get_device_name() + ) + + assert actor_worker_1.world_size == 2 + assert actor_worker_2.world_size == 6 + + data_1 = DataProto.from_dict({"a": torch.zeros(4)}) + data_2 = DataProto.from_dict({"a": torch.zeros(6)}) + actor_output_1 = actor_worker_1.add(data_1) + actor_output_2 = actor_worker_2.add(data_2) + print(actor_output_1.batch["a"].tolist()) + print(actor_output_2.batch["a"].tolist()) + assert actor_output_1.batch["a"].tolist() == [0, 0, 1, 1] + assert actor_output_2.batch["a"].tolist() == [100, 101, 102, 103, 104, 105] + + ray.shutdown() + + +def test_split_resource_pool_with_split_twice(): + ray.init() + + # assume we have 4 nodes, with 2 GPUs each + global_resource_pool = RayResourcePool(process_on_nodes=[2, 2, 2, 2]) + global_resource_pool.get_placement_groups(device_name=get_device_name()) + + # actors with [2, 1, 1, 1, 1, 2] (split twice) + rp_1, rp_2, rp_3 = split_resource_pool( + resource_pool=global_resource_pool, + split_size=[2, 4, 2], + ) + rp_2_1, rp_2_2, rp_2_3, rp_2_4 = split_resource_pool( + resource_pool=rp_2, + split_size=1, + ) + fp_list = [rp_1, rp_2_1, rp_2_2, rp_2_3, rp_2_4, rp_3] + correct_world_size = [2, 1, 1, 1, 1, 2] + correct_output = [ + [0.0, 0.0, 1.0, 1.0], # 2 worker + [100.0, 100.0, 100.0, 100.0], # 1 worker + [200.0, 200.0, 200.0, 200.0], # 1 worker + [300.0, 300.0, 300.0, 300.0], # 1 worker + [400.0, 400.0, 400.0, 400.0], # 1 worker + [500.0, 500.0, 501.0, 501.0], # 2 worker + ] + for idx, rp in 
enumerate(fp_list): + actor_cls = RayClassWithInitArgs(cls=Actor, worker_id=idx * 100) + actor_worker = RayWorkerGroup(resource_pool=rp, ray_cls_with_init=actor_cls, device_name=get_device_name()) + data = DataProto.from_dict({"a": torch.zeros(4)}) + actor_output = actor_worker.add(data) + assert actor_worker.world_size == correct_world_size[idx] + assert actor_output.batch["a"].tolist() == correct_output[idx] + + ray.shutdown() diff --git a/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py b/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py new file mode 100644 index 0000000000000000000000000000000000000000..13075d7b8ec4b3ec684894ac705c2cb887412fce --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_basics.py @@ -0,0 +1,147 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +e2e test verl.single_controller.ray +""" + +import ray +import torch + +from verl.single_controller.base.decorator import Dispatch, Execute, collect_all_to_all, register +from verl.single_controller.base.worker import Worker +from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils.device import get_device_name + + +def two_to_all_dispatch_fn(worker_group, *args, **kwargs): + """ + Assume the input is a list of 2. Duplicate the input interleaved and pass to each worker. 
+ """ + for arg in args: + assert len(arg) == 2 + for i in range(worker_group.world_size - 2): + arg.append(arg[i % 2]) + for k, v in kwargs.items(): + assert len(v) == 2 + for i in range(worker_group.world_size - 2): + v.append(v[i % 2]) + return args, kwargs + + +def get_ray_remote_options() -> str: + """Function that gets the torch.device based on the current machine. + This currently only supports CPU, CUDA, NPU. + Returns: + device + """ + if get_device_name() == "cuda": + return dict(num_gpus=0.1) + elif get_device_name() == "npu": + return dict(resources={"NPU": 0.1}) + return dict(num_cpus=0.1) + + +@ray.remote +class TestActor(Worker): + # TODO: pass *args and **kwargs is bug prone and not very convincing + def __init__(self, x) -> None: + super().__init__() + self._x = x + + def foo(self, y): + return self._x + y + + @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO) + def foo_rank_zero(self, x, y): + return self._x + y + x + + @register(Dispatch.ONE_TO_ALL, blocking=False) + def foo_one_to_all(self, x, y): + return self._x + y + x + + @register(Dispatch.ALL_TO_ALL, blocking=False) + def foo_all_to_all(self, x, y): + return self._x + y + x + + @register(dispatch_mode={"dispatch_fn": two_to_all_dispatch_fn, "collect_fn": collect_all_to_all}) + def foo_custom(self, x, y): + return self._x + y + x + + +@ray.remote(**get_ray_remote_options()) +def remote_call_wg(worker_names): + class_with_args = RayClassWithInitArgs(cls=TestActor, x=2) + worker_group = RayWorkerGroup.from_detached( + worker_names=worker_names, ray_cls_with_init=class_with_args, name_prefix=None + ) + print(worker_group.worker_names) + + output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6]) + assert output_ref == [8, 10, 8, 10] + + output_ref = worker_group.foo_rank_zero(x=1, y=2) + assert output_ref == 5 + + return worker_group.worker_names + + +def add_one(data): + data = data.to(get_device_name()) + data += 1 + data = data.to("cpu") + return data + + +def 
test_basics(): + ray.init(num_cpus=100) + + # create 4 workers, each hold a GPU + resource_pool = RayResourcePool([4], use_gpu=True) + class_with_args = RayClassWithInitArgs(cls=TestActor, x=2) + + worker_group = RayWorkerGroup( + resource_pool=resource_pool, + ray_cls_with_init=class_with_args, + name_prefix="worker_group_basic", + device_name=get_device_name(), + ) + + print(worker_group.worker_names) + + # this will wait for all the results + output = worker_group.execute_all_sync("foo", y=3) + assert output == [5, 5, 5, 5] + + # this is a list of object reference. It won't block. + output_ref = worker_group.execute_all_async("foo", y=4) + print(output_ref) + + assert ray.get(output_ref) == [6, 6, 6, 6] + + output_ref = worker_group.foo_one_to_all(x=1, y=2) + assert ray.get(output_ref) == [5, 5, 5, 5] + + output_ref = worker_group.foo_all_to_all(x=[1, 2, 3, 4], y=[5, 6, 7, 8]) + assert ray.get(output_ref) == [8, 10, 12, 14] + + print(ray.get(remote_call_wg.remote(worker_group.worker_names))) + + output = worker_group.execute_func_rank_zero(add_one, torch.ones(2, 2)) + torch.testing.assert_close(output, torch.ones(2, 2) + 1) + + ray.shutdown() + + +if __name__ == "__main__": + test_basics() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/README.md b/code/RL_model/verl/verl_train/tests/special_distributed/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f2f865e8bf95a673a0d6f56b74c7a2c12535faf2 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/README.md @@ -0,0 +1 @@ +This folder is reserved for unit tests (instead of end-to-end tests) that require multiple GPUs. 
diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh b/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d6c5c71e54a1d6000025840b1abc783f56b60d5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/run_all.sh @@ -0,0 +1,19 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash + +set -e -x +torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_tensor_dict.py +torchrun --nproc-per-node=4 --standalone tests/special_distributed/test_torch_functional.py diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..4c9b497c47cb9359efb6c9c598391ffb0493cb40 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_fsdp_ckpt.py @@ -0,0 +1,165 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import shutil +import tempfile + +import torch +import torch.distributed +from torch.distributed import init_device_mesh +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import MixedPrecision, ShardingStrategy +from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2Config + +from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager +from verl.utils.device import get_device_name, get_torch_device +from verl.utils.distributed import initialize_global_process_group +from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2 + + +def create_random_input_ids(batch_size, seq_len, vocab_size): + if get_device_name() == "cuda": + from flash_attn.bert_padding import unpad_input + elif get_device_name() == "npu": + from verl.utils.attention_utils import unpad_input + from verl.utils.model import compute_position_id_with_mask, create_random_mask + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=get_device_name()) + + attention_mask = create_random_mask( + input_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + input_ids = unpad_input(input_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + position_ids = unpad_input(position_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + return input_ids, position_ids + + +def test_fsdp_ckpt(strategy="fsdp"): + assert get_torch_device().device_count() >= 2, "need at least 2 gpus 
for test" + local_rank, rank, world_size = initialize_global_process_group() + device_mesh = init_device_mesh(get_device_name(), mesh_shape=(world_size,), mesh_dim_names=("dp",)) + + model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") + config = Qwen2Config(num_hidden_layers=1) + + with torch.device(get_device_name()): + model = AutoModelForCausalLM.from_config( + config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model = model.to(device=get_device_name()) + + # Wrap model with FSDP + if strategy == "fsdp": + mixed_precision = MixedPrecision( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32 + ) + + model = FSDP( + model, + use_orig_params=False, + device_id=get_torch_device().current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + device_mesh=device_mesh, + ) + else: + mp_policy = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, reduce_dtype=torch.float32, cast_forward_inputs=True + ) + fsdp_kwargs = { + "mesh": device_mesh, + "mp_policy": mp_policy, + } + apply_fsdp2(model, fsdp_kwargs, {}) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9) + + # Create checkpoint manager + tokenizer = AutoTokenizer.from_pretrained(model_name) + checkpoint_manager = FSDPCheckpointManager( + model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, tokenizer=tokenizer + ) + + # Generate sample input + batch_size = 10 + seq_len = 1024 + vocab_size = config.vocab_size + # First input for initial update + input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size) + + # Second input for verification + input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size) + + # Step 1: Initial update and save checkpoint + outputs1 = model(input_ids=input_ids1, position_ids=position_ids1) + loss1 = 
outputs1.logits.mean() + loss1.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Save checkpoint after first update + temp_dir = tempfile.mkdtemp() + checkpoint_path = os.path.join(temp_dir, "checkpoint") + checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0) + saved_state_dict = model.state_dict() + + # Step 2: Second update and forward pass + outputs2 = model(input_ids=input_ids2, position_ids=position_ids2) + loss2 = outputs2.logits.mean() + loss2.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Record logits after second update + with torch.no_grad(): + logits_before_load = model(input_ids=input_ids2, position_ids=position_ids2).logits + + # Step 3: Load checkpoint and repeat second update + checkpoint_manager.load_checkpoint(checkpoint_path) + loaded_state_dict = model.state_dict() + for key in loaded_state_dict: + assert key in saved_state_dict, f"Key {key} not found in saved state dict" + torch.testing.assert_close(loaded_state_dict[key], saved_state_dict[key], atol=0.0, rtol=0.0) + + # Repeat the second update with same input + outputs3 = model(input_ids=input_ids2, position_ids=position_ids2) + loss3 = outputs3.logits.mean() + loss3.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Record logits after loaded checkpoint and update + with torch.no_grad(): + logits_after_load = model(input_ids=input_ids2, position_ids=position_ids2).logits + + # Step 4: Verify outputs match + torch.testing.assert_close(logits_before_load, logits_after_load, atol=0.0, rtol=0.0) + print("Checkpoint save/load test passed!") + + # Cleanup + shutil.rmtree(temp_dir) + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + strategy = os.environ.get("STRATEGY", "fsdp") + os.environ["FLASH_ATTENTION_DETERMINISTIC"] = "1" + test_fsdp_ckpt(strategy=strategy) diff --git 
a/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..d8f24c49911ed7b1fb1d73740dfc150e57dade0d --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_mcore_config_converter.py @@ -0,0 +1,100 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import megatron.core.parallel_state as mpu +import torch +from megatron.core.transformer import MLATransformerConfig, TransformerConfig +from transformers import AutoConfig, PretrainedConfig + +from verl.models.mcore import hf_to_mcore_config +from verl.utils.distributed import destroy_global_process_group, initialize_global_process_group + +TEST_MODELS = [ + "Qwen/Qwen2.5-7B", # Qwen2 dense + "Qwen/Qwen3-8B", # Qwen3 dense + "deepseek-ai/deepseek-coder-1.3b-instruct", # deepseek dense + "Qwen/Qwen2-57B-A14B", # Qwen2 moe + "Qwen/Qwen3-30B-A3B", # Qwen3 moe + # "mistralai/Mixtral-8x7B-v0.1", # Mixtral # require authentication + "deepseek-ai/DeepSeek-V3-Base", # Deepseek V3 +] + + +def check_config_converter_results(tf_config: TransformerConfig | MLATransformerConfig, hf_config: PretrainedConfig): + assert tf_config.num_layers == hf_config.num_hidden_layers, ( + f"Number of layers mismatch: {tf_config.num_layers} != {hf_config.num_hidden_layers}" + ) + assert tf_config.hidden_size == hf_config.hidden_size, ( + f"Hidden size mismatch: {tf_config.hidden_size} != {hf_config.hidden_size}" + ) + assert tf_config.num_attention_heads == hf_config.num_attention_heads, ( + f"Number of attention heads mismatch: {tf_config.num_attention_heads} != {hf_config.num_attention_heads}" + ) + assert tf_config.num_query_groups == hf_config.num_key_value_heads, ( + f"Number of query groups mismatch: {tf_config.num_query_groups} != {hf_config.num_key_value_heads}" + ) + assert tf_config.ffn_hidden_size == hf_config.intermediate_size, ( + f"FFN hidden size mismatch: {tf_config.ffn_hidden_size} != {hf_config.intermediate_size}" + ) + assert tf_config.attention_dropout == hf_config.attention_dropout, ( + f"Attention dropout mismatch: {tf_config.attention_dropout} != {hf_config.attention_dropout}" + ) + assert tf_config.hidden_dropout == getattr(hf_config, "hidden_dropout", 0.0), ( + f"Hidden dropout mismatch: {tf_config.hidden_dropout} != {getattr(hf_config, 
'hidden_dropout', 0.0)}" + ) + if getattr(hf_config, "head_dim", None) is not None: + assert tf_config.kv_channels == getattr(hf_config, "head_dim", None), ( + f"Head dim mismatch: {tf_config.kv_channels} != {getattr(hf_config, 'head_dim', None)}" + ) + assert tf_config.layernorm_epsilon == hf_config.rms_norm_eps, ( + f"Layernorm epsilon mismatch: {tf_config.layernorm_epsilon} != {hf_config.rms_norm_eps}" + ) + + +def modify_hf_config(name: str, hf_config: PretrainedConfig): + if name == "deepseek-ai/DeepSeek-V3-Base": + hf_config.num_nextn_predict_layers = 0 + hf_config.quantization_config = None + return hf_config + + +def test_mcore_config_converter(): + """ + Test the conversion of Hugging Face model configurations to MCore configurations. + """ + local_rank, rank, world_size = initialize_global_process_group() + mpu.initialize_model_parallel( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=None, + use_sharp=False, + context_parallel_size=2, + expert_model_parallel_size=1, + expert_tensor_parallel_size=None, + nccl_communicator_config_path=None, + ) + for model_name in TEST_MODELS: + print(f"testing {model_name}") + hf_config = AutoConfig.from_pretrained(os.path.expanduser(f"~/models/configs/{model_name}/config.json")) + hf_config = modify_hf_config(model_name, hf_config) + tf_config = hf_to_mcore_config(hf_config, torch.bfloat16) + check_config_converter_results(tf_config, hf_config) + + destroy_global_process_group() + + +if __name__ == "__main__": + test_mcore_config_converter() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..565f8a8120845cddb8e166eb9f08f181dc2b6cff --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_tensor_dict.py @@ -0,0 +1,126 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["NCCL_DEBUG"] = "WARN" + +import numpy as np +import torch +import torch.distributed + +from verl.protocol import DataProto, all_gather_data_proto +from verl.utils.device import get_device_name +from verl.utils.distributed import initialize_global_process_group + + +def test_all_gather_data_proto(): + device_mesh = torch.distributed.device_mesh.init_device_mesh( + get_device_name(), mesh_shape=[2, 2], mesh_dim_names=["dp", "tp"] + ) + + global_rank = torch.distributed.get_rank() + + obs = torch.tensor([[1 * global_rank, 2 * global_rank + 1], [3 * global_rank, 4 * global_rank + 1]]) + + labels = ["a", "b"] if global_rank % 2 == 0 else ["b", "a"] + labels = np.array(labels, dtype=object) + data = DataProto.from_dict(tensors={"obs": obs}, non_tensors={"labels": labels}, meta_info={"info": "test_info"}) + + all_gather_data_proto(data=data, process_group=device_mesh.get_group("dp")) + + if global_rank == 0: + expected_obs = torch.tensor([[0, 1], [0, 1], [2, 5], [6, 9]], device=get_device_name()) + expected_labels = ["a", "b", "a", "b"] + elif global_rank == 1: + expected_obs = torch.tensor([[1, 3], [3, 5], [3, 7], [9, 13]], device=get_device_name()) + expected_labels = ["b", "a", "b", "a"] + elif global_rank == 2: + expected_obs = torch.tensor([[0, 1], [0, 1], [2, 5], [6, 9]], device=get_device_name()) + expected_labels = ["a", "b", "a", "b"] + elif global_rank == 3: + 
expected_obs = torch.tensor([[1, 3], [3, 5], [3, 7], [9, 13]], device=get_device_name()) + expected_labels = ["b", "a", "b", "a"] + + torch.testing.assert_close(data.batch["obs"], expected_obs, atol=0, rtol=0) + assert (data.non_tensor_batch["labels"] == expected_labels).all() + assert data.meta_info == {"info": "test_info"} + + +def test_vocab_parallel_entropy(): + from megatron.core import parallel_state as mpu + + from verl.utils.megatron.tensor_parallel import vocab_parallel_entropy + from verl.utils.profiler import log_gpu_memory_usage + from verl.utils.torch_functional import entropy_from_logits + + mpu.initialize_model_parallel( + tensor_model_parallel_size=2, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None + ) + + batch_size = 2 + seqlen = 128 + vocab_size = 155136 + + logits = torch.randn(batch_size * seqlen, vocab_size, device=get_device_name(), requires_grad=True) + target = torch.randint( + low=0, high=vocab_size, size=(batch_size * seqlen,), device=get_device_name(), dtype=torch.int64 + ) + + # broadcast across tp + torch.distributed.broadcast( + logits, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group() + ) + torch.distributed.broadcast( + target, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group() + ) + + tp_rank = mpu.get_tensor_model_parallel_rank() + vocab_size_per_tp = vocab_size // mpu.get_tensor_model_parallel_world_size() + + # get the local logits of each tp + vocab_parallel_logits = ( + logits.clone().detach()[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp].requires_grad_() + ) + logits.grad = None + vocab_parallel_logits.grad = None + + log_gpu_memory_usage("begin") + output_entropy = vocab_parallel_entropy(vocab_parallel_logits) + log_gpu_memory_usage("after forward") + grad_output = torch.randn_like(output_entropy) + output_entropy.backward(grad_output) + log_gpu_memory_usage("after backward") + + target_entropy = 
entropy_from_logits(logits) + torch.testing.assert_close(output_entropy, target_entropy) + target_entropy.backward(grad_output) + torch.testing.assert_close( + logits.grad[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp], vocab_parallel_logits.grad + ) + # make sure logits is not altered + torch.testing.assert_close( + logits[:, tp_rank * vocab_size_per_tp : (tp_rank + 1) * vocab_size_per_tp], vocab_parallel_logits + ) + + if mpu.get_tensor_model_parallel_rank() == 0: + print("test_vocab_parallel_entropy passes") + + mpu.destroy_model_parallel() + + +if __name__ == "__main__": + local_rank, rank, world_size = initialize_global_process_group() + test_all_gather_data_proto() + test_vocab_parallel_entropy() diff --git a/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py b/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..d07d335f5a313e6557e72e2331c88176486fc016 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_distributed/test_torch_functional.py @@ -0,0 +1,35 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import torch + +from verl.utils.torch_functional import allgather_dict_into_dict + +if __name__ == "__main__": + torch.distributed.init_process_group(backend="gloo") + + local_rank = int(os.environ["LOCAL_RANK"]) + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + metrics_dict = {"loss": [0 + rank, 1 + rank, 2 + rank], "grad_norm": rank} + + result = allgather_dict_into_dict(data=metrics_dict, group=None) + + assert result["loss"] == [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]] + assert result["grad_norm"] == [0, 1, 2, 3] + + print(result) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..03756f2d284ddcb58b41e068b4abd560b2d074f7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_api_docs.py @@ -0,0 +1,142 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fail CI if any function or class that is publicly exported via +``__all__`` lacks a docstring. 
+ +Usage +----- + # Check specific modules or packages + python check_docstrings.py mypkg.core mypkg.utils + + # Check an entire source tree (all top-level packages under cwd) + python check_docstrings.py +""" + +from __future__ import annotations + +import argparse +import importlib +import inspect +import pkgutil +import sys +from pathlib import Path +from types import ModuleType +from typing import Iterable + +_ALLOW_LIST = [ + "verl.third_party.vllm.LLM", + "verl.third_party.vllm.parallel_state", + "verl.utils.profiler.WorkerProfiler", + "verl.utils.profiler.WorkerProfilerExtension", + "verl.utils.profiler.log_gpu_memory_usage", + "verl.utils.profiler.log_print", + "verl.utils.profiler.mark_annotate", + "verl.utils.profiler.mark_end_range", + "verl.utils.profiler.mark_start_range", + "verl.models.mcore.qwen2_5_vl.get_vision_model_config", + "verl.models.mcore.qwen2_5_vl.get_vision_projection_config", + "verl.models.mcore.mbridge.freeze_moe_router", + "verl.models.mcore.mbridge.make_value_model", + "verl.utils.transformers_compat.flash_attn_supports_top_left_mask", +] + + +def iter_submodules(root: ModuleType) -> Iterable[ModuleType]: + """Yield *root* and every sub-module inside it.""" + yield root + + def print_pkg_error(pkg_name): + print(f"[warn] Skipping {pkg_name!r}", file=sys.stderr) + + if getattr(root, "__path__", None): # only packages have __path__ + for mod_info in pkgutil.walk_packages(root.__path__, prefix=f"{root.__name__}.", onerror=print_pkg_error): + try: + yield importlib.import_module(mod_info.name) + except Exception as exc: + print(f"[warn] Skipping {mod_info.name!r}: {exc}", file=sys.stderr) + + +def names_missing_doc(mod: ModuleType) -> list[str]: + """Return fully-qualified names that need docstrings.""" + missing: list[str] = [] + public = getattr(mod, "__all__", []) + for name in public: + obj = getattr(mod, name, None) + if f"{mod.__name__}.{name}" in _ALLOW_LIST: + continue + if obj is None: + # Exported but not found in the module: 
flag it anyway. + missing.append(f"{mod.__name__}.{name} (not found)") + continue + + if inspect.isfunction(obj) or inspect.isclass(obj): + doc = inspect.getdoc(obj) + if not doc or not doc.strip(): + missing.append(f"{mod.__name__}.{name}") + return missing + + +def check_module(qualname: str) -> list[str]: + """Import *qualname* and check it (and sub-modules).""" + try: + module = importlib.import_module(qualname) + except ModuleNotFoundError as exc: + print(f"[error] Cannot import '{qualname}': {exc}", file=sys.stderr) + return [qualname] + + missing: list[str] = [] + for submod in iter_submodules(module): + missing.extend(names_missing_doc(submod)) + return missing + + +def autodiscover_packages() -> list[str]: + """Detect top-level packages under CWD when no argument is given.""" + pkgs: list[str] = [] + for p in Path.cwd().iterdir(): + if p.is_dir() and (p / "__init__.py").exists(): + pkgs.append(p.name) + return pkgs + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "modules", + nargs="*", + help="Fully-qualified module or package names (defaults to every top-level package found in CWD).", + ) + args = parser.parse_args() + + targets = args.modules or autodiscover_packages() + if not targets: + raise ValueError("[error] No modules specified and none detected automatically.") + + all_missing: list[str] = [] + for modname in targets: + all_missing.extend(check_module(modname)) + + if all_missing: + print("\nMissing docstrings:") + for name in sorted(all_missing): + print(f" - {name}") + raise ValueError("Missing docstrings detected. 
Please enhance them with docs accordingly.") + + print("✅ All exported functions/classes have docstrings.") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..7c8521ab12e0fc2f39dd965d3aefbb4f303c12c9 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_dataproto_usage.py @@ -0,0 +1,69 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This CI test is used for checking whether DataProto is used in the code of some directory +""" + +import os +from argparse import ArgumentParser +from pathlib import Path + +SEARCH_WHITELIST = [] + +SEARCH_KEYWORDS = ["DataProto"] + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--directory", "-d", required=True, type=str) + args = parser.parse_args() + directory_in_str = args.directory + + pathlist = Path(directory_in_str).glob("**/*.py") + for path in pathlist: + path_in_str = str(path.absolute()) + + # judge whether current path is in pre-defined search whitelist or not. 
+ path_in_whitelist = False + + for sw in SEARCH_WHITELIST: + # for easy debugging in non-linux system + sw = sw.replace("/", os.sep) + if sw in path_in_str: + print(f"[SKIP] File {path_in_str} is in device api usage check whitelist, checking is skipped.") + path_in_whitelist = True + break + + if path_in_whitelist: + continue + + with open(path_in_str, encoding="utf-8") as f: + file_content = f.read() + + find_invalid_device_management = False + + for sk in SEARCH_KEYWORDS: + if sk in file_content: + find_invalid_device_management = True + break + + print( + f"[CHECK] File {path_in_str} is detected for DataProto usage check, check result: " + f"{'success' if not find_invalid_device_management else f'failed, because detect {sk}'}." + ) + + assert not find_invalid_device_management, ( + f"file {path_in_str} contains DataProto usage, please use TensorDict directly!" + ) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf9cf7e75a0cff068d87e1d369d8f7600306db1 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_device_api_usage.py @@ -0,0 +1,107 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This CI test is used for checking whether device api usage is irregular, suggest using api in `verl/utils/device.py`. +Search targets include .py files in verl/recipe and verl/verl. +Some files that must contain ".cuda", "cuda" or "nccl" keyword is pre-defined in whitelist below. +""" + +import os +from argparse import ArgumentParser +from pathlib import Path + +# directory or file path must contain keyword ".cuda" or "cuda" +CUDA_KEYWORD_CHECK_WHITELIST = [ + "verl/utils/device.py", + "verl/utils/torch_functional.py", # import flash_attn only on cuda + "verl/utils/profiler/nvtx_profile.py", # appear in NsightSystemsProfiler + "verl/utils/profiler/torch_profile.py", # appear in TorchProfiler + "verl/utils/profiler/config.py", # appear in TorchProfilerToolConfig + "verl/utils/kernel/linear_cross_entropy.py", # appear in nvidia nvtx + "verl/utils/rendezvous/ray_backend.py", # appear in cupy importance + "verl/single_controller/ray/base.py", # appear in default device_name + "verl/trainer/ppo/ray_trainer.py", # appear in default device_name + "verl/experimental/transfer_queue/ray_trainer.py", # appear in docstring as default device_name + "verl/experimental/one_step_off_policy/ray_trainer.py", # appear in docstring as default device_name + "verl/utils/reward_score/sandbox_fusion/utils.py", # appear in sandbox language type + "verl/workers/reward_model/megatron/reward_model.py", # appear in default device_name + "verl/third_party/torch/distributed/_state_dict_utils.py", # torch monkey patch fixes + "verl/third_party/torch/distributed/checkpoint/state_dict.py", # torch monkey patch fixes + "verl/workers/engine/base.py", # appear in default device_name + "verl/workers/engine/utils.py", # appear in enable_full_determinism + "verl/workers/engine/fsdp/transformer_impl.py", # appear in default device_name + "verl/workers/engine/veomni/transformer_impl.py", # appear in default device_name + "verl/workers/rollout/vllm_rollout/vllm_async_server.py", # appear in 
config.cudagraph_capture_sizes + "verl/workers/rollout/sglang_rollout/async_sglang_server.py", # manually set CUDA_VISIBLE_DEVICES + "verl/workers/rollout/trtllm_rollout/trtllm_async_server.py", # appear in config.cudagraph_capture_sizes + "verl/workers/rollout/replica.py", # appear in default device_name + "verl/checkpoint_engine", # checkpoint engine backend are device specific +] + +# directory or file path must contain keyword "nccl" +NCCL_KEYWORD_CHECK_WHITELIST = [ + "verl/utils/device.py", + "verl/third_party/sglang/parallel_state.py", # appear in default backend + "verl/recipe/fully_async_policy/param_sync.py", # fully_async_policy in default backend +] + +SEARCH_WHITELIST = CUDA_KEYWORD_CHECK_WHITELIST + NCCL_KEYWORD_CHECK_WHITELIST + +SEARCH_KEYWORDS = [".cuda", '"cuda"', '"nccl"'] + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--directory", "-d", required=True, type=str) + args = parser.parse_args() + directory_in_str = args.directory + + pathlist = Path(directory_in_str).glob("**/*.py") + for path in pathlist: + path_in_str = str(path.absolute()) + + # judge whether current path is in pre-defined search whitelist or not. + path_in_whitelist = False + + for sw in SEARCH_WHITELIST: + # for easy debugging in non-linux system + sw = sw.replace("/", os.sep) + if sw in path_in_str: + print(f"[SKIP] File {path_in_str} is in device api usage check whitelist, checking is skipped.") + path_in_whitelist = True + break + + if path_in_whitelist: + continue + + with open(path_in_str, encoding="utf-8") as f: + file_content = f.read() + + find_invalid_device_management = False + + for sk in SEARCH_KEYWORDS: + if sk in file_content: + find_invalid_device_management = True + break + + print( + f"[CHECK] File {path_in_str} is detected for device api usage check, check result: " + f"{'success' if not find_invalid_device_management else f'failed, because detect {sk}'}." 
+ ) + + assert not find_invalid_device_management, ( + f'file {path_in_str} contains .cuda/"cuda"/"nccl" usage, please use api in ' + f"verl/utils/device.py directly." + ) diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py new file mode 100644 index 0000000000000000000000000000000000000000..a54d1d50a7e9d21202387e2c9c8e3c6c73a5d807 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_docs_time_info.py @@ -0,0 +1,84 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Check that every .md and .rst file under docs/ contains the substring "Last updated", +with an allow-list for exceptions. +""" + +import sys +from pathlib import Path + +# === CONFIGURATION === + +# Relative paths (to docs/) or glob patterns to skip checking +ALLOW_LIST = { + "docs/README.md", # you can list individual files + "docs/legacy/*.rst", # or glob patterns + "docs/index.rst", + "docs/start/install.rst", + "docs/start/quickstart.rst", + "docs/README_vllm0.7.md", +} + +# The folder to scan +DOCS_DIR = Path("docs") + +# === SCRIPT === + + +def is_allowed(path: Path) -> bool: + """ + Return True if `path` matches any entry in ALLOW_LIST. 
+ """ + rel = str(path) + for pattern in ALLOW_LIST: + if Path(rel).match(pattern): + return True + return False + + +def main(): + if not DOCS_DIR.exists(): + print(f"Error: Documentation directory '{DOCS_DIR}' does not exist.", file=sys.stderr) + sys.exit(1) + + missing = [] + + # Gather all .md and .rst files under docs/ + for ext in ("*.md", "*.rst"): + for path in DOCS_DIR.rglob(ext): + if is_allowed(path): + continue + + text = path.read_text(encoding="utf-8", errors="ignore") + if "Last updated" not in text: + missing.append(path) + + # Report + if missing: + print("\nThe following files are missing the 'Last updated' string:\n") + for p in missing: + print(f" - {p}") + print(f"\nTotal missing: {len(missing)}\n", file=sys.stderr) + raise AssertionError( + "Some documentation files lack a 'Last updated' line. Please include info such as " + "'Last updated: mm/dd/yyyy' to indicate the last update time of the document." + ) + else: + print("✅ All checked files contain 'Last updated'.") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py new file mode 100644 index 0000000000000000000000000000000000000000..222ebef4997588257ebdf2e6ad88964ebcba78fc --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_docstrings.py @@ -0,0 +1,156 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Python script to check docstrings for functions and classes in specified files. +Checks that every public function and class has proper docstring documentation. +""" + +import ast +import os +import sys + + +class DocstringChecker(ast.NodeVisitor): + """AST visitor to check for missing docstrings in functions and classes.""" + + def __init__(self, filename: str): + self.filename = filename + self.missing_docstrings: list[tuple[str, str, int]] = [] + self.current_class = None + self.function_nesting_level = 0 + + def visit_FunctionDef(self, node: ast.FunctionDef): + """Visit function definitions and check for docstrings.""" + if not node.name.startswith("_") and self.function_nesting_level == 0: + if not self._has_docstring(node): + func_name = f"{self.current_class}.{node.name}" if self.current_class else node.name + self.missing_docstrings.append((func_name, self.filename, node.lineno)) + + self.function_nesting_level += 1 + self.generic_visit(node) + self.function_nesting_level -= 1 + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef): + """Visit async function definitions and check for docstrings.""" + if not node.name.startswith("_") and self.function_nesting_level == 0: + if not self._has_docstring(node): + func_name = f"{self.current_class}.{node.name}" if self.current_class else node.name + self.missing_docstrings.append((func_name, self.filename, node.lineno)) + + self.function_nesting_level += 1 + self.generic_visit(node) + self.function_nesting_level -= 1 + + def visit_ClassDef(self, node: ast.ClassDef): + """Visit class definitions and check for docstrings.""" + if not node.name.startswith("_"): + if not self._has_docstring(node): + self.missing_docstrings.append((node.name, self.filename, node.lineno)) + + old_class = self.current_class + self.current_class = node.name + self.generic_visit(node) + self.current_class = old_class + 
+ def _has_docstring(self, node) -> bool: + """Check if a node has a docstring.""" + return ast.get_docstring(node) is not None + + +def check_file_docstrings(filepath: str) -> list[tuple[str, str, int]]: + """Check docstrings in a single file.""" + try: + with open(filepath, encoding="utf-8") as f: + content = f.read() + + tree = ast.parse(content, filename=filepath) + checker = DocstringChecker(filepath) + checker.visit(tree) + return checker.missing_docstrings + + except Exception as e: + print(f"Error processing {filepath}: {e}") + return [] + + +def main(): + """Main function to check docstrings in specified files.""" + + files_to_check = [ + "verl/trainer/ppo/ray_trainer.py", + "verl/trainer/main_ppo.py", + "verl/trainer/ppo/reward.py", + "verl/utils/reward_score/__init__.py", + "verl/trainer/ppo/core_algos.py", + "verl/experimental/agent_loop/agent_loop.py", + "verl/workers/sharding_manager/fsdp_vllm.py", + "verl/workers/sharding_manager/fsdp_ulysses.py", + ] + + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_path = os.path.dirname(os.path.dirname(script_dir)) + + if not os.path.exists(repo_path): + print(f"Repository path {repo_path} does not exist!") + sys.exit(1) + + os.chdir(repo_path) + + all_missing_docstrings = [] + + print("Checking docstrings in specified files...") + print("=" * 60) + + for file_path in files_to_check: + if not os.path.exists(file_path): + print(f"Warning: File {file_path} does not exist!") + continue + + print(f"Checking {file_path}...") + missing = check_file_docstrings(file_path) + all_missing_docstrings.extend(missing) + + if missing: + print(f" Found {len(missing)} missing docstrings") + else: + print(" All functions and classes have docstrings [OK]") + + print("=" * 60) + + if all_missing_docstrings: + print(f"\nSUMMARY: Found {len(all_missing_docstrings)} functions/classes missing docstrings:") + print("-" * 60) + + by_file = {} + for name, filepath, lineno in all_missing_docstrings: + if filepath not in 
by_file: + by_file[filepath] = [] + by_file[filepath].append((name, lineno)) + + for filepath in sorted(by_file.keys()): + print(f"\n{filepath}:") + for name, lineno in sorted(by_file[filepath], key=lambda x: x[1]): + print(f" - {name} (line {lineno})") + + print(f"\nTotal missing docstrings: {len(all_missing_docstrings)}") + + raise Exception(f"Found {len(all_missing_docstrings)} functions/classes without proper docstrings!") + + else: + print("\n[OK] All functions and classes have proper docstrings!") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py b/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py new file mode 100644 index 0000000000000000000000000000000000000000..7cfa256b5f913841af65ac99975c52fe20ca3103 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/check_license.py @@ -0,0 +1,88 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from argparse import ArgumentParser +from pathlib import Path +from typing import Iterable + +license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates" +license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates" +license_head_bytedance_26 = "Copyright 2026 Bytedance Ltd. 
def get_py_files(path_arg: Path) -> Iterable[Path]:
    """Collect Python files under ``path_arg``.

    A directory is searched recursively; a ``.py`` file is returned as a
    one-element list; anything else yields an empty list.

    Args:
        path_arg (Path): directory or file to inspect

    Returns:
        Iterable[Path]: the Python files found
    """
    if path_arg.is_file():
        return [path_arg] if path_arg.suffix == ".py" else []
    if path_arg.is_dir():
        return path_arg.glob("**/*.py")
    return []
# Number of lines of the PR template that are compared against the PR body.
NUM_LINES = 5


# Custom exception types for clear error handling
class TemplateFileError(Exception):
    """Raised when the PR template file cannot be read."""


class PRBodyLoadError(Exception):
    """Raised when the PR body cannot be loaded from the event payload."""


class PRDescriptionError(Exception):
    """Raised when the PR description still matches the template placeholder."""


# Path to the PR template file
template_file = os.path.join(os.getenv("GITHUB_WORKSPACE", "."), ".github", "PULL_REQUEST_TEMPLATE.md")


def load_template(path):
    """
    Load the first NUM_LINES of the PR template file as a list of lines.

    Each line is whitespace-stripped so the later comparison is insensitive
    to trailing newlines and indentation.  (The previous docstring claimed
    no characters were stripped, which contradicted the implementation.)

    Args:
        path: path to the PR template file.

    Returns:
        list[str]: up to NUM_LINES stripped lines.

    Raises:
        TemplateFileError: if the file cannot be read.
    """
    lines = []
    try:
        with open(path, encoding="utf-8") as f:
            for _ in range(NUM_LINES):
                line = f.readline()
                if not line:  # EOF before NUM_LINES lines were read
                    break
                lines.append(line.strip())
        return lines
    except Exception as e:
        raise TemplateFileError(f"Failed to read PR template (first {NUM_LINES} lines) at {path}: {e}") from e
def main():
    """Entry point: compare the PR body against the template and report."""
    event_path = os.getenv("GITHUB_EVENT_PATH")
    if not event_path:
        raise OSError("GITHUB_EVENT_PATH is not set.")

    # Load both sides of the comparison, then validate the PR description.
    placeholder_lines = load_template(template_file)
    body = load_pr_body(event_path)
    check_pr_description(body, placeholder_lines)

    print("✅ '### What does this PR do?' section has been filled out.")
import os
import re

# PR title comes from the CI workflow environment.
pr_title = os.environ.get("PR_TITLE", "").strip()

# Define rules
allowed_modules = [
    "fsdp", "megatron", "veomni", "sglang", "vllm", "trtllm", "rollout", "trainer",
    "tests", "training_utils", "recipe", "hardware", "deployment",
    "ray", "worker", "single_controller", "misc", "docker", "ci",
    "perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg", "reward",
]
allowed_types = ["feat", "fix", "refactor", "chore", "test"]

# An optional [1/N] progress prefix is stripped before further checks.
serial_prefix = re.match(r"^\[\d/[\dNn]\]\s*(.+)$", pr_title, re.IGNORECASE)
if serial_prefix:
    pr_title = serial_prefix.group(1).strip()

# An optional [BREAKING] prefix marks a breaking change; the remainder is validated.
breaking_prefix = re.match(r"^\[BREAKING\]\s*(.+)$", pr_title, re.IGNORECASE)
is_breaking = breaking_prefix is not None
core_pr_title = breaking_prefix.group(1).strip() if is_breaking else pr_title

# The title must start with a bracketed, comma-separated module list.
module_section = re.match(r"^\[([a-z_,\s]+)\]", core_pr_title, re.IGNORECASE)
if not module_section:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [BREAKING][module] type: description")
    print(f"Allowed modules: {', '.join(allowed_modules)}")
    raise Exception("Invalid PR title")

modules = re.findall(r"[a-z_]+", module_section.group(1).lower())
invalid_modules = [module for module in modules if module not in allowed_modules]
if invalid_modules:
    print(f"❌ Invalid modules: {', '.join(invalid_modules)}")
    print(f"Allowed modules: {', '.join(allowed_modules)}")
    raise Exception("Invalid PR title")

# After the module list there must be an allowed change type and a description.
types_pattern = "|".join(re.escape(t) for t in allowed_types)
type_match = re.match(rf"^\[[a-z_,\s]+\]\s+({types_pattern}):\s+.+$", core_pr_title, re.IGNORECASE)
if not type_match:
    print(f"❌ Invalid PR title: '{pr_title}'")
    print("Expected format: [BREAKING][module] type: description")
    print(f"Allowed types: {', '.join(allowed_types)}")
    raise Exception("Invalid PR title")

change_type = type_match.group(1).lower()

# Build the success message
breaking_info = " (BREAKING CHANGE)" if is_breaking else ""
print(f"✅ PR title is valid: {pr_title}, modules: {modules}, type: {change_type}{breaking_info}")
def validate_yaml_format(yaml_lines):
    """Check that every YAML key line has a comment directly above it, carries
    no inline comment, and is followed by a blank line (or end of file).

    Args:
        yaml_lines: raw lines of the YAML file (newlines included).

    Returns:
        list[str]: human-readable error messages; empty when compliant.
    """
    key_re = re.compile(r"^(\s*)([a-zA-Z0-9_]+):")
    errors = []

    for idx, raw in enumerate(yaml_lines):
        text = raw.strip()

        # Blank lines never trigger checks.
        if not text:
            continue

        # Only lines that introduce a YAML key ("field:" or "field: value") are checked.
        if key_re.match(raw) is None:
            continue

        # Rule 1: a '#' comment must sit directly above every key.
        has_comment_above = idx > 0 and yaml_lines[idx - 1].strip().startswith("#")
        if not has_comment_above:
            errors.append(f"Missing comment above line {idx + 1}: {raw.strip()}")

        # Rule 2: no inline comments after the key's colon.
        if "#" in raw and not text.startswith("#"):
            if raw.index("#") > raw.index(":"):
                errors.append(f"Inline comment found on line {idx + 1}: {raw.strip()}")

        # Rule 3: a key line must be followed by a blank line unless it is the last line.
        if idx + 1 < len(yaml_lines) and yaml_lines[idx + 1].strip() != "":
            errors.append(f"Missing blank line after line {idx + 1}: {raw.strip()}")

    return errors
def test_import():
    """Smoke test: importing the top-level ``verl`` package succeeds and it exposes ``__version__``."""
    import verl

    print(verl.__version__)


def test_single_controller_import():
    """Smoke test: importing ``verl.single_controller`` succeeds and it exposes ``__version__``."""
    import verl.single_controller

    print(verl.single_controller.__version__)
# Result codes returned by has_type_annotations.
CHECK_SUCCESS = 0
CHECK_WARNING = 1
CHECK_FAILURE = -1


def should_check_type(arg_name: str) -> bool:
    """Return True when an argument name is subject to annotation checking.

    ``self``/``cls`` and star-arguments (``*args``/``**kwargs``) are exempt.
    """
    return arg_name not in ("self", "cls") and not arg_name.startswith("*")


def has_type_annotations(node: ast.AST, debug: bool = False) -> int:
    """Grade a single AST node for type-annotation completeness.

    Only plain ``ast.FunctionDef`` nodes are graded; every other node passes.
    A function passes when it is private, accepts ``*args``/``**kwargs``, or
    has every checkable argument plus the return value annotated.

    Args:
        node: the AST node to inspect.
        debug: when True, print the unannotated arguments of failing nodes.

    Returns:
        int: CHECK_SUCCESS or CHECK_FAILURE.
    """
    if not isinstance(node, ast.FunctionDef):
        return CHECK_SUCCESS

    # Functions taking *args/**kwargs are exempt from the check.
    if node.args.vararg is not None or node.args.kwarg is not None:
        return CHECK_SUCCESS

    # Private helpers are exempt as well.
    if node.name.startswith("_"):
        return CHECK_SUCCESS

    checkable = [arg for arg in node.args.args if should_check_type(arg.arg)]
    fully_annotated = node.returns is not None and all(arg.annotation is not None for arg in checkable)
    if fully_annotated:
        return CHECK_SUCCESS

    if debug:
        print(node, [(arg.annotation, arg.arg) for arg in checkable])
    return CHECK_FAILURE
git", + ) + parser.add_argument("--debug", action="store_true", help="Add debugging logs") + args = parser.parse_args() + + total_changed = 0 + total_annotated = 0 + all_warnings: list[tuple[Path, int, str]] = [] + all_failures: list[tuple[Path, int, str]] = [] + + target_files = [args.target_file] if args.target_file is not None else get_changed_files() + for fpath in target_files: + if "tests/" in str(fpath): + continue + if args.all_lines: + changed_lines = [i + 1 for i in range(len(open(fpath).readlines()))] + else: + changed_lines = get_changed_lines(fpath) + annotated, total, warning_lines, failure_lines = check_file(fpath, changed_lines, args.debug) + total_annotated += annotated + total_changed += total + all_warnings.extend(warning_lines) + all_failures.extend(failure_lines) + + ratio = (total_annotated / total_changed) if total_changed else 1.0 + + print( + f"🔍 Type coverage on {'all' if args.all_lines else 'changed'} lines: " + f"{total_annotated}/{total_changed} = {ratio:.2%}. Files inspected: {target_files}" + ) + + if all_warnings: + print("\n⚠️ Suggest Improve: Lines missing type annotations for inputs and outputs:\n") + for fname, lineno, line in all_warnings: + print(f"{fname}:{lineno}: {line}") + + if all_failures: + print("⚠️ [ERROR] Lines missing type annotations for inputs and outputs:\n") + for fname, lineno, line in all_failures: + print(f"{fname}:{lineno}: {line}") + + if ratio < args.threshold: + print( + f"Please add type annotations for inputs and outputs to meet threshold {args.threshold}. " + f"Cases exempt from checking:" + ) + print("1. Private methods.") + print("2. Args with name in ('self', 'cls'), or *args / **kwargs") + print("3. 
Files under tests/") + raise Exception(f"\n❌ Type coverage below threshold ({args.threshold:.0%}).") + else: + if all_warnings or all_failures: + print("") + print("✅ Type annotation coverage acceptable.\n") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py b/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..b36a407be77a777cd72a4abf8ce4727d375eb548 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/special_sanity/validate_imported_docs.py @@ -0,0 +1,130 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +verify_imported_docs.py + +Assert that every function or class *explicitly imported* (via +`from import `) in a given Python file has a docstring. +""" + +from __future__ import annotations + +import argparse +import ast +import importlib +import inspect +import pathlib +import sys + + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Verify that imported functions/classes have docstrings.") + p.add_argument( + "--target-file", + default="verl/trainer/ppo/ray_trainer.py", + help="Path to the Python source file to analyse (e.g. 
verl/trainer/ppo/ray_trainer.py)", + ) + p.add_argument( + "--allow-list", + default=["omegaconf.open_dict"], + help="a list of third_party dependencies that do not have proper docs :(", + ) + p.add_argument( + "--project-root", + default=".", + help="Directory to prepend to PYTHONPATH so local packages resolve (default: .)", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress success message (still prints errors).", + ) + return p.parse_args() + + +def _import_attr(module_name: str, attr_name: str): + """Import `module_name` then return `getattr(module, attr_name)`.""" + module = importlib.import_module(module_name) + return getattr(module, attr_name) + + +def _check_file(py_file: pathlib.Path, project_root: pathlib.Path, allow_list: list[str]) -> list[str]: + """Return a list of error strings (empty == success).""" + # Ensure local packages resolve + sys.path.insert(0, str(project_root.resolve())) + + tree = ast.parse(py_file.read_text(), filename=str(py_file)) + problems: list[str] = [] + + for node in ast.walk(tree): + if not isinstance(node, ast.ImportFrom): + continue + + # Relative imports (level > 0) get the leading dots stripped + module_name = "." * node.level + (node.module or "") + for alias in node.names: + if alias.name == "*": + problems.append( + f"{py_file}:{node.lineno} - wildcard import `from {module_name} import *` cannot be verified." 
def main() -> None:
    """CLI entry point: parse arguments, run the docstring check, and report."""
    cli = _parse_args()
    target = pathlib.Path(cli.target_file).resolve()
    root = pathlib.Path(cli.project_root).resolve()

    if not target.is_file():
        raise Exception(f"❌ Target file not found: {target}")

    problems = _check_file(target, root, cli.allow_list)
    if problems:
        print("Docstring verification failed:\n")
        print("\n".join(f" • {e}" for e in problems))
        raise Exception("❌ Docstring verification failed.")

    if not cli.quiet:
        print(f"✅ All explicitly imported functions/classes in {target} have docstrings.")
def find_violations(tests_root: Path, allowed: set[str], allowed_files: list[str]) -> list[str]:
    """Return error messages for test files that sit outside an allowed subfolder.

    Args:
        tests_root: root directory of the test tree.
        allowed: first-level folder names tests may live under.
        allowed_files: exact paths (as strings) exempt from the layout rule.

    Returns:
        list[str]: one message per misplaced test file (empty == compliant).
    """
    problems: list[str] = []
    for candidate in tests_root.rglob("test*.py"):
        # Explicitly whitelisted files are exempt.
        if str(candidate) in allowed_files:
            continue

        parts = candidate.relative_to(tests_root).parts
        if len(parts) < 2:
            # The file sits directly at the tests root.
            problems.append(f"{candidate}: must be inside one of {sorted(allowed)} (not at tests root)")
        elif parts[0] not in allowed:
            problems.append(
                f"{candidate}: subfolder '{parts[0]}' under tests/ is not an allowed module. "
                f"The valid ones are: {sorted(allowed)}"
            )
    return problems
0000000000000000000000000000000000000000..6f79d474d156e16ae54bb3d0c8f9ae7d0e16946e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/trainer/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests for the trainer module. +""" diff --git a/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py b/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..a039fa6e43aff7a19c9a88de00f74239d183fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/actor/test_special_dp_actor.py @@ -0,0 +1,304 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class MockTransformerModel(nn.Module):
    """Lightweight stand-in for a causal LM, used to exercise DataParallelPPOActor.

    Loosely mirrors the HF interface: ``forward`` accepts ``input_ids`` plus
    ignored ``attention_mask``/``position_ids``/``use_cache`` kwargs and returns
    an object exposing a ``.logits`` tensor of shape (batch, seq, vocab).
    """

    def __init__(self, vocab_size=1000, hidden_size=64):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # Submodule registration order is kept stable (embedding, transformer, lm_head).
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=4, batch_first=True), num_layers=2
        )
        self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, attention_mask=None, position_ids=None, use_cache=False, **kwargs):
        # The unpack doubles as a cheap rank check on input_ids (must be 2-D).
        batch_size, seq_len = input_ids.shape

        hidden = self.transformer(self.embedding(input_ids))
        token_logits = self.lm_head(hidden)

        class MockOutput:
            # Minimal stand-in for a transformers ModelOutput: only .logits is needed.
            def __init__(self, logits):
                self.logits = logits

        return MockOutput(token_logits)
torch.npu.set_device(cls.rank) + cls.device = torch.device(f"npu:{cls.rank}") + else: + cls.device = torch.device("cpu") + + def setUp(self): + """Set up test fixtures""" + self.config = FSDPActorConfig( + strategy="fsdp2", + ppo_mini_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + ppo_epochs=1, + clip_ratio=0.2, + entropy_coeff=0.01, + grad_clip=1.0, + use_dynamic_bsz=False, + use_torch_compile=False, # Disable torch.compile for testing + ulysses_sequence_parallel_size=1, + optim=OptimizerConfig(lr=1e-6), + rollout_n=1, + ) + + self.mock_model = MockTransformerModel(vocab_size=1000, hidden_size=64).to(self.device) + self.mock_optimizer = torch.optim.Adam(self.mock_model.parameters(), lr=1e-4) + + self.actor = DataParallelPPOActor( + config=self.config, actor_module=self.mock_model, actor_optimizer=self.mock_optimizer + ) + + @classmethod + def tearDownClass(cls): + """Clean up distributed environment""" + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + def _create_test_data_for_compute_log_prob(self): + """Create test DataProto for compute_log_prob method""" + batch_size = 2 + prompt_length = 8 + response_length = 4 + total_length = prompt_length + response_length + vocab_size = 1000 + + input_ids = torch.randint(0, vocab_size, (batch_size, total_length)).to(self.device) + attention_mask = torch.ones(batch_size, total_length).to(self.device) + position_ids = torch.arange(total_length).unsqueeze(0).expand(batch_size, -1).to(self.device) + responses = input_ids[:, -response_length:] # Last part is the response + + tensor_dict = TensorDict( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + }, + batch_size=[batch_size], + ) + + meta_info = {"micro_batch_size": batch_size, "temperature": 1.0, "use_dynamic_bsz": False} + + return DataProto(batch=tensor_dict, meta_info=meta_info) + + def _create_test_data_for_update_policy(self): + """Create test DataProto 
for update_policy method""" + batch_size = 4 # Must match ppo_mini_batch_size + prompt_length = 8 + response_length = 4 + total_length = prompt_length + response_length + vocab_size = 1000 + + input_ids = torch.randint(0, vocab_size, (batch_size, total_length)).to(self.device) + attention_mask = torch.ones(batch_size, total_length).to(self.device) + position_ids = torch.arange(total_length).unsqueeze(0).expand(batch_size, -1).to(self.device) + responses = input_ids[:, -response_length:] + response_mask = torch.ones(batch_size, response_length).to(self.device) + old_log_probs = torch.randn(batch_size, response_length).to(self.device) * 0.1 # Small values + advantages = torch.randn(batch_size, response_length).to(self.device) * 0.5 + + tensor_dict = TensorDict( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "responses": responses, + "response_mask": response_mask, + "old_log_probs": old_log_probs, + "advantages": advantages, + }, + batch_size=[batch_size], + ) + + meta_info = {"temperature": 1.0} + + return DataProto(batch=tensor_dict, meta_info=meta_info) + + def test_compute_log_prob(self): + """Test compute_log_prob method""" + data = self._create_test_data_for_compute_log_prob() + + outputs = self.actor.compute_log_prob(data, calculate_entropy=True) + log_probs = outputs["log_probs"] + entropys = outputs["entropys"] + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + + self.assertIsInstance(entropys, torch.Tensor) + self.assertEqual(entropys.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(entropys))) + self.assertTrue(torch.all(entropys >= 0)) # Entropy should be non-negative + + def test_compute_log_prob_without_entropy(self): + """Test compute_log_prob 
method without entropy calculation""" + data = self._create_test_data_for_compute_log_prob() + + outputs = self.actor.compute_log_prob(data, calculate_entropy=False) + log_probs = outputs["log_probs"] + entropys = outputs.get("entropys", None) + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + self.assertIsNone(entropys) + + def test_update_policy(self): + """Test update_policy method""" + data = self._create_test_data_for_update_policy() + + metrics = self.actor.update_policy(data) + + self.assertIsInstance(metrics, dict) + + expected_metric_keys = [ + "actor/pg_loss", + "actor/pg_clipfrac", + "actor/ppo_kl", + "actor/pg_clipfrac_lower", + "actor/grad_norm", + ] + + for key in expected_metric_keys: + self.assertIn(key, metrics) + if isinstance(metrics[key], list): + self.assertTrue(all(torch.isfinite(torch.tensor(v)) for v in metrics[key])) + else: + self.assertIsInstance(metrics[key], (float, int)) + self.assertTrue(torch.isfinite(torch.tensor(metrics[key]))) + + def test_dataparallelppoactor_initialization(self): + """Test DataParallelPPOActor initialization""" + self.assertIsNotNone(self.actor.actor_module) + self.assertIsNotNone(self.actor.actor_optimizer) + self.assertEqual(self.actor.config, self.config) + + self.assertEqual(self.actor.config.strategy, "fsdp2") + self.assertEqual(self.actor.config.ppo_mini_batch_size, 4) + self.assertEqual(self.actor.config.clip_ratio, 0.2) + + def test_dataparallelppoactor_with_qwen3_model(self): + """Test DataParallelPPOActor with real Qwen3ForCausalLM model""" + qwen_config = Qwen3Config( + vocab_size=1000, + hidden_size=64, + intermediate_size=128, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + max_position_embeddings=512, + torch_dtype=torch.float32, + use_cache=False, 
+ ) + + with torch.device(self.device): + qwen_model = AutoModelForCausalLM.from_config(config=qwen_config, torch_dtype=torch.float32).to(self.device) + + qwen_optimizer = torch.optim.Adam(qwen_model.parameters(), lr=1e-4) + + qwen_actor = DataParallelPPOActor(config=self.config, actor_module=qwen_model, actor_optimizer=qwen_optimizer) + + data = self._create_test_data_for_compute_log_prob() + outputs = qwen_actor.compute_log_prob(data, calculate_entropy=True) + log_probs = outputs["log_probs"] + entropys = outputs["entropys"] + + batch_size = data.batch["responses"].shape[0] + response_length = data.batch["responses"].shape[1] + + self.assertIsInstance(log_probs, torch.Tensor) + self.assertEqual(log_probs.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(log_probs))) + + self.assertIsInstance(entropys, torch.Tensor) + self.assertEqual(entropys.shape, (batch_size, response_length)) + self.assertTrue(torch.all(torch.isfinite(entropys))) + self.assertTrue(torch.all(entropys >= 0)) + + policy_data = self._create_test_data_for_update_policy() + metrics = qwen_actor.update_policy(policy_data) + + self.assertIsInstance(metrics, dict) + + expected_metric_keys = [ + "actor/pg_loss", + "actor/pg_clipfrac", + "actor/ppo_kl", + "actor/pg_clipfrac_lower", + "actor/grad_norm", + ] + + for key in expected_metric_keys: + self.assertIn(key, metrics) + if isinstance(metrics[key], list): + self.assertTrue(all(torch.isfinite(torch.tensor(v)) for v in metrics[key])) + else: + self.assertIsInstance(metrics[key], (float, int)) + self.assertTrue(torch.isfinite(torch.tensor(metrics[key]))) + + +if __name__ == "__main__": + unittest.main() diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..464746b56ccb710f487590c992ddcea70c998663 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/tests/workers/config/test_actor_config_on_cpu.py @@ -0,0 +1,256 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from verl.utils.config import omega_conf_to_dataclass +from verl.workers.config import ( + ActorConfig, + FSDPActorConfig, + McoreActorConfig, + OptimizerConfig, +) + + +class TestActorConfig(unittest.TestCase): + """Test the ActorConfig dataclass and its variants.""" + + def test_config_inheritance(self): + """Test that the inheritance hierarchy works correctly.""" + megatron_dict = { + "_target_": "verl.workers.config.McoreActorConfig", + "strategy": "megatron", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "clip_ratio": 0.2, + "optim": { + "_target_": "verl.workers.config.McoreOptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + fsdp_dict = { + "_target_": "verl.workers.config.FSDPActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "clip_ratio": 0.2, + "optim": { + "_target_": "verl.workers.config.FSDPOptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + + megatron_config = omega_conf_to_dataclass(megatron_dict) + fsdp_config = omega_conf_to_dataclass(fsdp_dict) + + self.assertIsInstance(megatron_config, ActorConfig) + self.assertIsInstance(fsdp_config, ActorConfig) + + self.assertEqual(megatron_config.ppo_mini_batch_size, 
fsdp_config.ppo_mini_batch_size) + self.assertEqual(megatron_config.clip_ratio, fsdp_config.clip_ratio) + + def test_actor_config_from_yaml(self): + """Test creating ActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="actor", overrides=["strategy=fsdp", "ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, ActorConfig) + self.assertEqual(config.strategy, "fsdp") + + def test_fsdp_actor_config_from_yaml(self): + """Test creating FSDPActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="dp_actor", overrides=["strategy=fsdp2", "ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, FSDPActorConfig) + self.assertEqual(config.strategy, "fsdp2") + + def test_megatron_actor_config_from_yaml(self): + """Test creating McoreActorConfig from YAML file.""" + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose(config_name="megatron_actor", overrides=["ppo_micro_batch_size_per_gpu=128"]) + + config = omega_conf_to_dataclass(cfg) + + self.assertIsInstance(config, McoreActorConfig) + self.assertEqual(config.strategy, "megatron") + + def test_config_get_method(self): + """Test the get method for backward compatibility.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + self.assertEqual(config.get("strategy"), "fsdp") 
+ self.assertEqual(config.get("ppo_mini_batch_size"), 256) + + self.assertIsNone(config.get("non_existing")) + self.assertEqual(config.get("non_existing", "default"), "default") + + def test_config_dict_like_access(self): + """Test dictionary-like access to config fields.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + self.assertEqual(config["strategy"], "fsdp") + self.assertEqual(config["ppo_mini_batch_size"], 256) + + field_names = list(config) + self.assertIn("strategy", field_names) + self.assertIn("ppo_mini_batch_size", field_names) + + self.assertGreater(len(config), 0) + + def test_frozen_fields_modification_raises_exception(self): + """Test that modifying frozen fields raises an exception.""" + config_dict = { + "_target_": "verl.workers.config.ActorConfig", + "strategy": "fsdp", + "ppo_mini_batch_size": 256, + "ppo_micro_batch_size_per_gpu": 256, + "optim": { + "_target_": "verl.workers.config.OptimizerConfig", + "lr": 0.1, + }, + "rollout_n": 1, + } + config = omega_conf_to_dataclass(config_dict) + + with self.assertRaises(AttributeError): + config.strategy = "megatron" + + with self.assertRaises(AttributeError): + config.clip_ratio = 0.5 + + config.ppo_mini_batch_size = 512 # This should work since it's not in frozen fields anymore + self.assertEqual(config.ppo_mini_batch_size, 512) + + def test_actor_config_validation_exceptions(self): + """Test that ActorConfig.__post_init__ raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + loss_agg_mode="invalid-mode", + use_dynamic_bsz=True, + optim=optim, + ppo_micro_batch_size_per_gpu=4, + rollout_n=1, + ) + 
self.assertIn("Invalid loss_agg_mode", str(cm.exception)) + + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_micro_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + optim=optim, + rollout_n=1, + ) + self.assertIn("You have set both", str(cm.exception)) + + with self.assertRaises((ValueError, AssertionError)) as cm: + ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + optim=optim, + rollout_n=1, + ) + self.assertIn("Please set at least one", str(cm.exception)) + + config = ActorConfig( + strategy="fsdp", + use_dynamic_bsz=True, + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + optim=optim, + rollout_n=1, + ) + self.assertIsNotNone(config) # Should not raise an exception + + def test_fsdp_actor_config_validation_exceptions(self): + """Test that FSDPActorConfig.validate() raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + config = FSDPActorConfig( + strategy="fsdp", + ulysses_sequence_parallel_size=2, + use_dynamic_bsz=True, # Skip batch size validation to focus on FSDP validation + optim=optim, + rollout_n=1, + ) + + model_config = {"use_remove_padding": False} + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=8, train_batch_size=256, model_config=model_config) + self.assertIn("you must enable `use_remove_padding`", str(cm.exception)) + + def test_actor_config_validate_method_exceptions(self): + """Test that ActorConfig.validate() raises appropriate validation exceptions.""" + optim = OptimizerConfig(lr=0.1) + config = ActorConfig( + strategy="fsdp", + use_dynamic_bsz=False, + ppo_mini_batch_size=256, + ppo_micro_batch_size=8, + ppo_micro_batch_size_per_gpu=None, # Ensure only one batch size setting is used + optim=optim, + rollout_n=1, + ) + + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=8, train_batch_size=128) + 
self.assertIn("train_batch_size", str(cm.exception)) + + with self.assertRaises(ValueError) as cm: + config.validate(n_gpus=16, train_batch_size=512) + self.assertIn("must be >= n_gpus", str(cm.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..fb03560e0f491c3243ce9384b48821110c720fa5 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_critic_config_on_cpu.py @@ -0,0 +1,305 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from pathlib import Path + +import pytest +from hydra import compose, initialize_config_dir + +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.profiler import ProfilerConfig +from verl.workers.config import ( + CriticConfig, + FSDPCriticConfig, + FSDPOptimizerConfig, + McoreCriticConfig, + McoreOptimizerConfig, + OptimizerConfig, +) + + +@pytest.mark.skip(reason="This test is flaky when we actively load model config") +class TestCriticConfig: + """Test suite for critic configuration dataclasses.""" + + @pytest.fixture + def config_dir(self): + """Get the path to the config directory.""" + return Path(__file__).parent.parent.parent.parent / "verl" / "trainer" / "config" / "critic" + + def test_megatron_critic_config_instantiation_from_yaml(self, config_dir): + """Test that McoreCriticConfig can be instantiated from megatron_critic.yaml.""" + yaml_path = config_dir / "megatron_critic.yaml" + assert yaml_path.exists(), f"Config file not found: {yaml_path}" + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/critic")): + test_config = compose(config_name="megatron_critic", overrides=["ppo_micro_batch_size_per_gpu=1"]) + + megatron_config_obj = omega_conf_to_dataclass(test_config) + + assert isinstance(megatron_config_obj, McoreCriticConfig) + assert isinstance(megatron_config_obj, CriticConfig) + + expected_attrs = [ + "strategy", + "rollout_n", + "optim", + "model", + "ppo_mini_batch_size", + "ppo_max_token_len_per_gpu", + "cliprange_value", + "get", + "nccl_timeout", + "megatron", + "load_weight", + ] + for attr in expected_attrs: + assert hasattr(megatron_config_obj, attr), f"Missing attribute: {attr}" + + assert callable(megatron_config_obj.get) + assert megatron_config_obj.strategy == "megatron" + + def test_fsdp_critic_config_instantiation_from_yaml(self, config_dir): + """Test that FSDPCriticConfig can be instantiated from dp_critic.yaml.""" + yaml_path = config_dir / "dp_critic.yaml" + assert 
yaml_path.exists(), f"Config file not found: {yaml_path}" + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/critic")): + test_config = compose(config_name="dp_critic", overrides=["ppo_micro_batch_size_per_gpu=1"]) + + fsdp_config_obj = omega_conf_to_dataclass(test_config) + + assert isinstance(fsdp_config_obj, FSDPCriticConfig) + assert isinstance(fsdp_config_obj, CriticConfig) + + expected_attrs = [ + "strategy", + "rollout_n", + "optim", + "model", + "ppo_mini_batch_size", + "ppo_max_token_len_per_gpu", + "cliprange_value", + "get", + "forward_micro_batch_size", + "forward_micro_batch_size_per_gpu", + "ulysses_sequence_parallel_size", + "grad_clip", + ] + for attr in expected_attrs: + assert hasattr(fsdp_config_obj, attr), f"Missing attribute: {attr}" + + assert callable(fsdp_config_obj.get) + assert fsdp_config_obj.strategy == "fsdp" + + def test_config_inheritance_hierarchy(self): + """Test that the inheritance hierarchy is correct.""" + megatron_config = McoreCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=McoreOptimizerConfig(lr=0.1)) + assert isinstance(megatron_config, CriticConfig) + assert isinstance(megatron_config, McoreCriticConfig) + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + assert isinstance(fsdp_config, CriticConfig) + assert isinstance(fsdp_config, FSDPCriticConfig) + + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=OptimizerConfig(lr=0.1)) + assert isinstance(critic_config, CriticConfig) + assert not isinstance(critic_config, McoreCriticConfig) + assert not isinstance(critic_config, FSDPCriticConfig) + + def test_config_dict_interface(self): + """Test that configs provide dict-like interface from BaseConfig.""" + optim = OptimizerConfig(lr=0.1) + config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + + assert "strategy" in config + assert config["strategy"] == "fsdp2" + + assert 
config.get("strategy") == "fsdp2" + assert config.get("nonexistent_key", "default") == "default" + + keys = list(config) + assert "strategy" in keys + assert "rollout_n" in keys + + assert len(config) > 0 + + def test_frozen_fields_immutability(self): + """Test that frozen fields raise exceptions when modified after creation.""" + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=OptimizerConfig(lr=0.1)) + frozen_fields = ["rollout_n", "strategy", "cliprange_value"] + + for field_name in frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(critic_config, field_name, "modified_value") + + megatron_config = McoreCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=McoreOptimizerConfig(lr=0.1)) + megatron_frozen_fields = ["nccl_timeout", "load_weight", "data_loader_seed"] + + for field_name in megatron_frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(megatron_config, field_name, "modified_value") + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + fsdp_frozen_fields = ["ulysses_sequence_parallel_size", "grad_clip"] + + for field_name in fsdp_frozen_fields: + with pytest.raises((AttributeError, TypeError, ValueError)): + setattr(fsdp_config, field_name, "modified_value") + + def test_batch_size_fields_modifiable(self): + """Test that batch size fields can be modified after creation.""" + optim = OptimizerConfig(lr=0.1) + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + + critic_config.ppo_mini_batch_size = 8 + critic_config.ppo_micro_batch_size = 4 + critic_config.ppo_micro_batch_size_per_gpu = 2 + + assert critic_config.ppo_mini_batch_size == 8 + assert critic_config.ppo_micro_batch_size == 4 + assert critic_config.ppo_micro_batch_size_per_gpu == 2 + + fsdp_config = FSDPCriticConfig(ppo_micro_batch_size_per_gpu=1, optim=FSDPOptimizerConfig(lr=0.1)) + + 
fsdp_config.forward_micro_batch_size = 16 + fsdp_config.forward_micro_batch_size_per_gpu = 8 + + assert fsdp_config.forward_micro_batch_size == 16 + assert fsdp_config.forward_micro_batch_size_per_gpu == 8 + + def test_profiler_config_type_validation(self): + """Test that profiler field has correct type and validation.""" + optim = OptimizerConfig(lr=0.1) + critic_config = CriticConfig(ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim) + assert isinstance(critic_config.profiler, ProfilerConfig) + assert critic_config.profiler.all_ranks is False + assert critic_config.profiler.ranks == [] + + custom_profiler = ProfilerConfig(all_ranks=True, ranks=[0, 1]) + critic_config_custom = CriticConfig( + profiler=custom_profiler, ppo_micro_batch_size_per_gpu=1, strategy="fsdp2", optim=optim + ) + assert isinstance(critic_config_custom.profiler, ProfilerConfig) + assert critic_config_custom.profiler.all_ranks is True + assert critic_config_custom.profiler.ranks == [0, 1] + + profiler1 = ProfilerConfig(enable=True, ranks=[0, 1]) + profiler2 = ProfilerConfig(all_ranks=True, ranks=[1, 2]) + + union_result = profiler1.union(profiler2) + assert union_result.enable is True + assert union_result.all_ranks is True + assert set(union_result.ranks) == {0, 1, 2} + + intersect_result = profiler1.intersect(profiler2) + assert intersect_result.all_ranks is False + assert intersect_result.ranks == [1] + + def test_critic_config_validation_logic(self): + """Test the __post_init__ validation logic for CriticConfig.""" + optim = OptimizerConfig(lr=0.1) + valid_config = CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, use_dynamic_bsz=False, optim=optim + ) + assert valid_config.ppo_micro_batch_size_per_gpu == 2 + + valid_config2 = CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size_per_gpu=None, + ppo_micro_batch_size=4, + ppo_mini_batch_size=8, + use_dynamic_bsz=False, + optim=optim, + ) + assert valid_config2.ppo_micro_batch_size == 4 + + dynamic_config = 
CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, use_dynamic_bsz=True, optim=optim + ) + assert dynamic_config.use_dynamic_bsz is True + + with pytest.raises(ValueError, match="You have set both.*micro_batch_size.*AND.*micro_batch_size_per_gpu"): + CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size=4, + ppo_micro_batch_size_per_gpu=2, + use_dynamic_bsz=False, + optim=optim, + ) + + with pytest.raises( + ValueError, match="Please set at least one of.*micro_batch_size.*or.*micro_batch_size_per_gpu" + ): + CriticConfig( + strategy="fsdp2", + ppo_micro_batch_size=None, + ppo_micro_batch_size_per_gpu=None, + use_dynamic_bsz=False, + optim=optim, + ) + + def test_micro_batch_size_divisibility_validation(self): + """Test micro batch size divisibility validation in __post_init__.""" + optim = OptimizerConfig(lr=0.1) + valid_config = CriticConfig( + strategy="fsdp2", ppo_micro_batch_size_per_gpu=2, ppo_mini_batch_size=8, use_dynamic_bsz=False, optim=optim + ) + assert valid_config.ppo_mini_batch_size == 8 + assert valid_config.ppo_micro_batch_size_per_gpu == 2 + + valid_config_with_mbs = CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=8, ppo_micro_batch_size=4, use_dynamic_bsz=False, optim=optim + ) + assert valid_config_with_mbs.ppo_mini_batch_size == 8 + assert valid_config_with_mbs.ppo_micro_batch_size == 4 + + with pytest.raises(ValueError, match="ppo_mini_batch_size.*must be divisible by.*ppo_micro_batch_size"): + CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=7, ppo_micro_batch_size=4, use_dynamic_bsz=False, optim=optim + ) + + dynamic_config = CriticConfig( + strategy="fsdp2", ppo_mini_batch_size=7, ppo_micro_batch_size=4, use_dynamic_bsz=True, optim=optim + ) + assert dynamic_config.use_dynamic_bsz is True + + def test_fsdp_sequence_parallelism_validation(self): + """Test FSDP sequence parallelism validation in FSDPCriticConfig.__post_init__.""" + valid_config = FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + 
ulysses_sequence_parallel_size=2, + model={"use_remove_padding": True}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + assert valid_config.ulysses_sequence_parallel_size == 2 + + with pytest.raises( + ValueError, match="When using sequence parallelism for critic, you must enable.*use_remove_padding" + ): + FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + ulysses_sequence_parallel_size=2, + model={"use_remove_padding": False}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + + valid_config_no_sp = FSDPCriticConfig( + ppo_micro_batch_size_per_gpu=2, + ulysses_sequence_parallel_size=1, + model={"use_remove_padding": False}, + optim=FSDPOptimizerConfig(lr=0.1), + ) + assert valid_config_no_sp.ulysses_sequence_parallel_size == 1 diff --git a/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1253f5c9ab9943df3c187a3c8458b35f78fe6994 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_engine_config_on_cpu.py @@ -0,0 +1,67 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from verl.workers.config.engine import FSDPEngineConfig, McoreEngineConfig + + +class TestMcoreEngineConfig: + def test_default_values(self): + config = McoreEngineConfig() + assert config.tensor_model_parallel_size == 1 + assert config.sequence_parallel is False # Should be auto-corrected + assert config.seed == 42 + + def test_post_init_validation(self): + # Test TP size 1 forces sequence_parallel=False + config = McoreEngineConfig(tensor_model_parallel_size=1) + assert config.sequence_parallel is False + + # Test TP >1 keeps sequence_parallel=True + config = McoreEngineConfig(tensor_model_parallel_size=2) + assert config.sequence_parallel is True + + def test_mutable_fields(self): + config = McoreEngineConfig() + config.sequence_parallel = True # Should be mutable + with pytest.raises(AttributeError): + config.tensor_model_parallel_size = 2 # Frozen field + + @pytest.mark.parametrize("offload_field", ["param_offload", "grad_offload", "optimizer_offload"]) + def test_offload_flags(self, offload_field): + config = McoreEngineConfig(**{offload_field: True}) + assert getattr(config, offload_field) is True + + +class TestFSDPEngineConfigCPU: + def test_default_values(self): + config = FSDPEngineConfig() + assert config.param_offload is False + assert config.optimizer_offload is False + assert config.fsdp_size == -1 + + @pytest.mark.parametrize( + "offload_params", + [{"param_offload": True}, {"optimizer_offload": True}, {"param_offload": True, "optimizer_offload": True}], + ) + def test_offload_combinations(self, offload_params): + config = FSDPEngineConfig(**offload_params) + assert config.param_offload == offload_params.get("param_offload", False) + assert config.optimizer_offload == offload_params.get("optimizer_offload", False) + + def test_wrap_policy_configuration(self): + test_policy = {"layer_class": "TransformerBlock"} + config = FSDPEngineConfig(wrap_policy=test_policy) + assert config.wrap_policy == test_policy diff --git 
a/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..b44cb40c6b1dceca7da61af2bcebeb20d0fb9b58 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/config/test_optim_config_on_cpu.py @@ -0,0 +1,48 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from verl.workers.config.optimizer import FSDPOptimizerConfig + + +class TestFSDPOptimizerConfigCPU: + def test_default_configuration(self): + config = FSDPOptimizerConfig(lr=0.1) + assert config.min_lr_ratio is None + assert config.lr_scheduler_type == "constant" + assert config.num_cycles == 0.5 + + @pytest.mark.parametrize("lr_scheduler_type", ["constant", "cosine"]) + def test_valid_lr_scheduler_types(self, lr_scheduler_type): + config = FSDPOptimizerConfig(lr_scheduler_type=lr_scheduler_type, lr=0.1) + assert config.lr_scheduler_type == lr_scheduler_type + + @pytest.mark.parametrize("warmup_style", ["constant", "cosine"]) + def test_valid_warmup_style_types(self, warmup_style): + config = FSDPOptimizerConfig(warmup_style=warmup_style, lr=0.1) + assert config.lr_scheduler_type == warmup_style + + def test_invalid_lr_scheduler_type(self): + with pytest.raises((ValueError, AssertionError)): + FSDPOptimizerConfig(lr_scheduler_type="invalid_style", lr=0.1) + + def 
test_invalid_warmup_style_type(self): + with pytest.raises((ValueError, AssertionError)): + FSDPOptimizerConfig(warmup_style="invalid_style", lr=0.1) + + @pytest.mark.parametrize("num_cycles", [0.1, 1.0, 2.5]) + def test_num_cycles_configuration(self, num_cycles): + config = FSDPOptimizerConfig(num_cycles=num_cycles, lr=0.1) + assert config.num_cycles == num_cycles diff --git a/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py b/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py new file mode 100644 index 0000000000000000000000000000000000000000..d6eaa10cf17ffa10a686c9530d8c291f73c98fcb --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/critic/test_special_dp_critic.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import tempfile
import unittest
from unittest.mock import Mock, patch

import torch
import torch.distributed
from omegaconf import OmegaConf
from tensordict import TensorDict
from transformers import AutoConfig

from verl import DataProto
from verl.workers.config import FSDPCriticConfig, FSDPOptimizerConfig
from verl.workers.config.critic import FSDPCriticModelCfg
from verl.workers.config.engine import FSDPEngineConfig
from verl.workers.fsdp_workers import CriticWorker


class TestCriticWorker(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Set up distributed environment"""
        # NOTE(review): init_method="env://" assumes RANK/WORLD_SIZE/MASTER_ADDR/
        # MASTER_PORT are set, i.e. this test is launched via torchrun — confirm.
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(
                backend="nccl" if torch.cuda.is_available() else "gloo", init_method="env://"
            )

        cls.rank = torch.distributed.get_rank()
        cls.world_size = torch.distributed.get_world_size()

        # Pin each rank to its own GPU when CUDA is available.
        if torch.cuda.is_available():
            torch.cuda.set_device(cls.rank)
            cls.device = torch.device(f"cuda:{cls.rank}")
        else:
            cls.device = torch.device("cpu")

    @classmethod
    def tearDownClass(cls):
        """Clean up distributed environment"""
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()

    def setUp(self):
        """Set up test fixtures"""

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()

        # Save only the model *config* to the temp dir; the full weights are
        # loaded from model_path by CriticWorker itself.
        model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
        config = AutoConfig.from_pretrained(model_path)
        config.save_pretrained(self.temp_dir)

        self.config = FSDPCriticConfig(
            strategy="fsdp2",
            ppo_mini_batch_size=4,
            ppo_micro_batch_size_per_gpu=2,
            forward_micro_batch_size_per_gpu=2,
            ppo_epochs=1,
            cliprange_value=0.5,
            grad_clip=1.0,
            use_dynamic_bsz=False,
            ulysses_sequence_parallel_size=1,
            rollout_n=1,
            optim=FSDPOptimizerConfig(lr=1e-6),
            model=FSDPCriticModelCfg(
                path=model_path,
                tokenizer_path=model_path,
                fsdp_config=FSDPEngineConfig(fsdp_size=-1),
                use_remove_padding=False,
            ),
        )
        # Cap world size at 2 — presumably so ppo_mini_batch_size (4) still
        # splits into micro batches of 2 per rank; TODO confirm the intent.
        assert self.world_size <= 4 // 2

    def tearDown(self):
        """Clean up test fixtures"""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _create_test_data_for_compute_values(self, batch_size=2, seq_len=10, response_len=5):
        """Create test data for compute_values method"""
        input_ids = torch.randint(0, 1000, (batch_size, seq_len), dtype=torch.long)
        attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
        position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)
        responses = torch.randint(0, 1000, (batch_size, response_len), dtype=torch.long)
        response_mask = torch.ones(batch_size, response_len, dtype=torch.float)

        batch = TensorDict(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "responses": responses,
                "response_mask": response_mask,
            },
            batch_size=[batch_size],
        )

        data = DataProto(
            batch=batch, meta_info={"micro_batch_size": 2, "max_token_len": seq_len, "use_dynamic_bsz": False}
        )

        return data

    def _create_test_data_for_update_critic(self, batch_size=2, seq_len=10, response_len=5):
        """Create test data for update_critic method"""
        input_ids = torch.randint(0, 1000, (batch_size, seq_len), dtype=torch.long)
        attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
        position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)
        responses = torch.randint(0, 1000, (batch_size, response_len), dtype=torch.long)
        response_mask = torch.ones(batch_size, response_len, dtype=torch.float)
        # update_critic additionally needs baseline values and returns.
        values = torch.randn(batch_size, response_len, dtype=torch.float)
        returns = torch.randn(batch_size, response_len, dtype=torch.float)

        batch = TensorDict(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "responses": responses,
                "response_mask": response_mask,
                "values": values,
                "returns": returns,
            },
            batch_size=[batch_size],
        )

        data = DataProto(
            batch=batch,
            meta_info={"global_token_num": [response_len] * batch_size, "batch_seqlens": [response_len] * batch_size},
        )

        return data

    def test_init_model(self):
        """Test CriticWorker.init_model() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        self.assertIsNotNone(worker.critic_module)
        self.assertIsNotNone(worker.critic_optimizer)
        self.assertIsNotNone(worker.critic)
        self.assertIsNotNone(worker.checkpoint_manager)

    def test_compute_values(self):
        """Test CriticWorker.compute_values() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        data = self._create_test_data_for_compute_values()

        result = worker.compute_values(data)

        self.assertIsInstance(result, DataProto)
        self.assertIn("values", result.batch)
        values = result.batch["values"]

        # Values are expected per response token only (see fixture sizes above).
        batch_size, response_len = 2, 5
        self.assertEqual(values.shape, (batch_size, response_len))

        self.assertTrue(torch.isfinite(values).all())

    def test_update_critic(self):
        """Test CriticWorker.update_critic() method"""
        worker = CriticWorker(self.config)
        worker.init_model()

        data = self._create_test_data_for_update_critic()

        result = worker.update_critic(data)

        self.assertIsInstance(result, DataProto)
        self.assertIn("metrics", result.meta_info)
        metrics = result.meta_info["metrics"]

        expected_keys = ["critic/vf_loss", "critic/vf_clipfrac", "critic/vpred_mean", "critic/grad_norm"]
        for key in expected_keys:
            self.assertIn(key, metrics)

        # Every reported metric (scalar or list of scalars) must be finite.
        for key, value in metrics.items():
            if isinstance(value, list | tuple):
                for v in value:
                    self.assertTrue(torch.isfinite(torch.tensor(v)).all())
            else:
                self.assertTrue(torch.isfinite(torch.tensor(value)).all())

    @patch("transformers.AutoConfig.from_pretrained")
    def test_critic_attn_implementation_override_functionality(self, mock_config_from_pretrained):
        """Test that CriticWorker correctly uses attn_implementation from override_config"""

        # Mock the AutoConfig return value
        mock_config = Mock()
        mock_config.tie_word_embeddings = False
        mock_config.architectures = ["LlamaForCausalLM"]
        mock_config.num_labels = 1
        mock_config_from_pretrained.return_value = mock_config

        # Test different attn_implementation values
        test_cases = [
            ("eager", "eager"),
            ("sdpa", "sdpa"),
            ("flash_attention_2", "flash_attention_2"),
            (None, "flash_attention_2"),  # Default case
        ]

        for override_value, expected_value in test_cases:
            mock_config_from_pretrained.reset_mock()

            # Create config with override_config
            config_dict = {
                "model": {
                    "path": "/test/model/path",
                    "tokenizer_path": "/test/tokenizer/path",
                    "fsdp_config": {
                        "fsdp_size": 1,
                        "param_offload": False,
                        "optimizer_offload": False,
                    },
                },
                "optim": {"lr": 1e-4, "type": "AdamW"},
                "strategy": "fsdp",
                "ppo_mini_batch_size": 1,
                "ppo_epochs": 1,
                "rollout_n": 1,
                "checkpoint": {"save_contents": [], "load_contents": []},
            }

            # Add override_config with attn_implementation if specified
            if override_value is not None:
                config_dict["model"]["override_config"] = {"attn_implementation": override_value}

            # Convert to OmegaConf
            test_config = OmegaConf.create(config_dict)

            # Test the extraction logic that should happen in CriticWorker._build_critic_model_optimizer
            override_config = OmegaConf.to_container(OmegaConf.create(test_config.model.get("override_config", {})))
            extracted_attn_implementation = override_config.get("attn_implementation", "flash_attention_2")

            # Verify the extraction works correctly
            self.assertEqual(
                extracted_attn_implementation,
                expected_value,
                f"Expected {expected_value}, got {extracted_attn_implementation} for override_value {override_value}",
            )

    def test_critic_model_config_structure(self):
        """Test that critic model config properly incorporates override settings"""

        # Test configuration scenarios
        test_scenarios = [
            {"name": "default_flash_attention", "override_config": {}, "expected_attn": "flash_attention_2"},
            {"name": "eager_override", "override_config": {"attn_implementation": "eager"}, "expected_attn": "eager"},
            {"name": "sdpa_override", "override_config": {"attn_implementation": "sdpa"}, "expected_attn": "sdpa"},
            {
                "name": "mixed_config",
                "override_config": {"attn_implementation": "eager", "dropout": 0.1, "num_labels": 1},
                "expected_attn": "eager",
            },
        ]

        for scenario in test_scenarios:
            with self.subTest(scenario=scenario["name"]):
                # Simulate the config processing logic from CriticWorker
                override_config = scenario["override_config"]

                # Test the extraction logic
                extracted_attn = override_config.get("attn_implementation", "flash_attention_2")

                # Verify correct extraction
                self.assertEqual(extracted_attn, scenario["expected_attn"], f"Failed for scenario {scenario['name']}")

                # Verify other configs are preserved
                if "dropout" in override_config:
                    self.assertEqual(override_config["dropout"], 0.1)

    def test_critic_hydra_config_compatibility(self):
        """Test that Hydra +prefix configurations work correctly for CriticWorker"""

        # Simulate Hydra configuration with +prefix for critic
        # This would come from: +critic.model.override_config.attn_implementation=eager
        hydra_config_dict = {
            "critic": {"model": {"path": "/test/model/path", "override_config": {"attn_implementation": "eager"}}}
        }

        omegaconf = OmegaConf.create(hydra_config_dict)

        # Extract override config as would be done in CriticWorker
        override_model_config = OmegaConf.to_container(
            OmegaConf.create(omegaconf.critic.model.get("override_config", {}))
        )

        # Test extraction
        attn_implementation = override_model_config.get("attn_implementation", "flash_attention_2")
        self.assertEqual(attn_implementation, "eager")

    def test_critic_backward_compatibility(self):
        """Test that CriticWorker maintains backward compatibility with existing configurations"""

        # Test cases for backward compatibility
        compatibility_tests = [
            {"name": "no_override_config", "config": {}, "expected": "flash_attention_2"},
            {"name": "empty_override_config", "config": {"override_config": {}}, "expected": "flash_attention_2"},
            {
                "name": "other_overrides_only",
                "config": {"override_config": {"dropout": 0.1, "hidden_size": 768}},
                "expected": "flash_attention_2",
            },
        ]

        for test in compatibility_tests:
            with self.subTest(test=test["name"]):
                override_config = test["config"].get("override_config", {})
                attn_implementation = override_config.get("attn_implementation", "flash_attention_2")

                self.assertEqual(
                    attn_implementation, test["expected"], f"Backward compatibility failed for {test['name']}"
                )

    def test_critic_and_actor_independent_configuration(self):
        """Test that critic and actor can have independent attention implementation configurations"""

        # Simulate a complete training configuration with both actor and critic
        complete_config = {
            "actor_rollout_ref": {"model": {"override_config": {"attn_implementation": "eager"}}},
            "critic": {"model": {"override_config": {"attn_implementation": "sdpa"}}},
        }

        omegaconf = OmegaConf.create(complete_config)

        # Extract actor config
        actor_override = OmegaConf.to_container(
            OmegaConf.create(omegaconf.actor_rollout_ref.model.get("override_config", {}))
        )
        actor_attn = actor_override.get("attn_implementation", "flash_attention_2")

        # Extract critic config
        critic_override = OmegaConf.to_container(OmegaConf.create(omegaconf.critic.model.get("override_config", {})))
        critic_attn = critic_override.get("attn_implementation", "flash_attention_2")

        # Verify independent configuration
        self.assertEqual(actor_attn, "eager")
        self.assertEqual(critic_attn, "sdpa")
        self.assertNotEqual(actor_attn, critic_attn)  # Ensure they are indeed different


if __name__ == "__main__":
    unittest.main()
diff --git a/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py
b/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..9932ae8917805e3c92bbc0e11abd398463e8e87a --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/reward_manager/test_registry_on_cpu.py @@ -0,0 +1,94 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +# Assuming REWARD_MANAGER_REGISTRY is defined somewhere in the module +from verl.workers.reward_manager.registry import REWARD_MANAGER_REGISTRY, get_reward_manager_cls, register + + +@pytest.fixture +def setup(): + """Setup test cases with a mock registry.""" + REWARD_MANAGER_REGISTRY.clear() + REWARD_MANAGER_REGISTRY.update({"manager1": "Manager1Class", "manager2": "Manager2Class"}) + return REWARD_MANAGER_REGISTRY + + +def test_get_existing_manager(setup): + """Test getting an existing reward manager class.""" + assert get_reward_manager_cls("manager1") == "Manager1Class" + assert get_reward_manager_cls("manager2") == "Manager2Class" + + +def test_get_nonexistent_manager(setup): + """Test getting a non-existent reward manager raises ValueError.""" + with pytest.raises(ValueError) as excinfo: + get_reward_manager_cls("unknown_manager") + assert "Unknown reward manager: unknown_manager" in str(excinfo.value) + + +def test_case_sensitivity(setup): + """Test that manager names are case-sensitive.""" + with pytest.raises(ValueError): + 
get_reward_manager_cls("MANAGER1") + with pytest.raises(ValueError): + get_reward_manager_cls("Manager1") + + +def test_empty_registry(setup): + """Test behavior when registry is empty.""" + REWARD_MANAGER_REGISTRY.clear() + with pytest.raises(ValueError) as excinfo: + get_reward_manager_cls("any_manager") + assert "Unknown reward manager: any_manager" in str(excinfo.value) + + +def test_register_new_class(setup): + """Test registering a new class with the decorator.""" + + @register("test_manager") + class TestManager: + pass + + assert "test_manager" in REWARD_MANAGER_REGISTRY + assert REWARD_MANAGER_REGISTRY["test_manager"] == TestManager + + +def test_register_different_classes_same_name(setup): + """Test that registering different classes with same name raises ValueError.""" + + @register("conflict_manager") + class Manager1: + pass + + with pytest.raises(ValueError): + + @register("conflict_manager") + class Manager2: + pass + + assert REWARD_MANAGER_REGISTRY["conflict_manager"] == Manager1 + + +def test_decorator_returns_original_class(setup): + """Test that the decorator returns the original class unchanged.""" + + @register("return_test") + class OriginalClass: + def method(setup): + return 42 + + assert OriginalClass().method() == 42 + assert REWARD_MANAGER_REGISTRY["return_test"] == OriginalClass diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py b/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py new file mode 100644 index 0000000000000000000000000000000000000000..d7239ea88dd14f6b7fc4927388ff47273c02a34e --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/perf/vllm_async_rollout.py @@ -0,0 +1,138 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Compare vLLM AsyncLLM backend: ExternalRayDistributedExecutor(remote call) vs RayDistributedExecutor(compiled graph) + +1. Prepare openai/gsm8k dataset +python3 examples/data_preprocess/gsm8k.py + +2. Run perf test +python3 tests/workers/rollout/perf/vllm_async_rollout.py >perf.log 2>&1 + +hardware: Nvidia 8*H20 +packages: +- torch==2.6.0 +- vllm==0.8.5 + +[DEBUG] backend: sync, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 21.27 secs +[DEBUG] backend: zeromq, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 23.40 secs +[DEBUG] backend: ray, n_gpus_per_node: 8, batch_size: 2048, step: 0, step_time: 25.33 secs +""" + +import os +import time + +import ray +from omegaconf import DictConfig +from torch.utils.data import SequentialSampler +from torchdata.stateful_dataloader import StatefulDataLoader + +from tests.experimental.agent_loop.agent_utils import AgentLoopManager, RayWorkerGroup, init_agent_loop_manager +from verl.protocol import DataProto +from verl.utils import hf_tokenizer +from verl.utils.dataset import RLHFDataset +from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn + + +def init_config(n_gpus_per_node) -> DictConfig: + import os + + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose( + config_name="ppo_trainer", + overrides=[ + "actor_rollout_ref.actor.use_dynamic_bsz=true", + "actor_rollout_ref.actor.fsdp_config.param_offload=True", + "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True", + 
], + ) + config.trainer.n_gpus_per_node = n_gpus_per_node + config.data.train_batch_size = 128 + config.data.return_raw_chat = True + config.actor_rollout_ref.model.path = "Qwen/Qwen2.5-7B-Instruct" + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2 + config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9 + config.actor_rollout_ref.rollout.multi_turn.format = "hermes" + config.actor_rollout_ref.rollout.prompt_length = 4096 + config.actor_rollout_ref.rollout.response_length = 4096 + config.actor_rollout_ref.rollout.n = 16 + + return config + + +def initialize(config, backend) -> tuple[AgentLoopManager | RayWorkerGroup, StatefulDataLoader]: + env_vars = { + "NCCL_DEBUG": "WARN", + "VLLM_USE_V1": "1", + "VERL_VLLM_DISTRIBUTED_BACKEND": backend, + } + ray.init(runtime_env={"env_vars": env_vars}) + + # STEP 1: init async llm server + server = init_agent_loop_manager(config) + + # STEP 2: create dataloader + tokenizer = hf_tokenizer(config.actor_rollout_ref.model.path) + dataset = RLHFDataset( + data_files=os.path.expanduser("~/data/gsm8k/train.parquet"), + tokenizer=tokenizer, + config=config.data, + ) + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=config.data.get("gen_batch_size", config.data.train_batch_size), + num_workers=config.data.get("dataloader_num_workers", 8), + drop_last=True, + collate_fn=default_collate_fn, + sampler=SequentialSampler(dataset), + ) + + return server, dataloader + + +def perf_rollout(mode, backend, n_gpus_per_node, num_steps): + config = init_config(n_gpus_per_node) + config.actor_rollout_ref.rollout.mode = mode + agent_loop_manager, dataloader = initialize(config, backend) + + for step, batch in enumerate(dataloader): + batch: DataProto = DataProto.from_single_dict(batch) + batch = batch.pop( + batch_keys=["input_ids", "attention_mask", "position_ids"], + non_tensor_batch_keys=["raw_prompt_ids", "raw_prompt"], + ) + t_start = time.time() + gen_batch 
= agent_loop_manager.generate_sequences(batch) + t_end = time.time() + print( + f"[DEBUG] backend: {backend}, n_gpus_per_node: {n_gpus_per_node}, batch_size: {len(gen_batch)}, " + f"step: {step}, step_time: {t_end - t_start:.2f} secs" + ) + if step + 1 >= num_steps: + break + + ray.shutdown() + + +if __name__ == "__main__": + num_steps = 1 + n_gpus_per_node = 8 + + # test_cases = [("sync", "sync"), ("async", "zeromq"), ("async", "ray")] + test_cases = [("async", "zeromq"), ("async", "ray")] + for mode, backend in test_cases: + perf_rollout(mode=mode, backend=backend, n_gpus_per_node=n_gpus_per_node, num_steps=num_steps) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config new file mode 100644 index 0000000000000000000000000000000000000000..aa3f1eec5af8477543a487bacd602ab0d2f7390b --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/sandbox_fusion_tool_config @@ -0,0 +1,17 @@ +tools: + - class_name: "verl.tools.sandbox_fusion_tools.SandboxFusionTool" + config: + sandbox_fusion_url: "https://xxx.apigateway-cn-beijing.volceapi.com/run_code" + type: native + tool_schema: + type: "function" + function: + name: "code_interpreter" + description: "A tool for executing code." + parameters: + type: "object" + properties: + code: + type: "string" + description: "The code to execute." 
          required: ["code"]
diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config
new file mode 100644
index 0000000000000000000000000000000000000000..926b6b832f283175f92cc86b6cc4a1964096a8d3
--- /dev/null
+++ b/code/RL_model/verl/verl_train/tests/workers/rollout/resource/tool_configs/search_tool_config
@@ -0,0 +1,23 @@
tools:
  - class_name: verl.tools.search_tool.SearchTool
    config:
      retrieval_service_url: http://127.0.0.1:8000/retrieve
      num_workers: 120
      rate_limit: 120
      timeout: 30
      type: native
    tool_schema:
      type: function
      function:
        name: search
        description: Searches the web for relevant information based on the given query.
        parameters:
          type: object
          properties:
            query_list:
              type: array
              # JSON Schema's array-element keyword is "items"; the previous
              # "item" key is not a schema keyword and was silently ignored.
              items:
                type: string
              description: A list of fully-formed semantic queries. The tool will return search results for each query.
          required:
            - query_list
diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89607705fef92b7ea728cceee7275fa8054c1d0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_sglang/test_http_server_engine.py
@@ -0,0 +1,978 @@
# Copyright 2025 z.ai
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is adapted from multiple sources: +# 1. THUDM/slime project +# Original source: https://github.com/THUDM/slime/blob/main/slime/backends/sglang_utils/http_server_engine.py +# Copyright 2025 z.ai +# Licensed under the Apache License, Version 2.0 +# 2. SGLang project +# Original source: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server_engine.py +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 +# +# Modifications made by z.ai and ModelBest Inc. include but are not limited to: +# - Enhanced error handling and retry logic +# - Added async support with connection pooling +# - Extended functionality for distributed weight updates +# - Improved logging and monitoring capabilities +# - Additional configuration options and optimizations + +"""Complete unit tests for HTTP Server Engine Adapters. + +This module contains comprehensive unit tests for both HttpServerEngineAdapter +and AsyncHttpServerEngineAdapter classes, covering all public methods, +error handling scenarios, edge cases, and boundary conditions using pytest and mock frameworks. + +Tests use real SGLang modules for integration testing while mocking external dependencies. 
+""" + +import asyncio +from unittest.mock import AsyncMock, Mock, patch + +import aiohttp +import pytest +import requests +from sglang.srt.managers.io_struct import ( + UpdateWeightsFromTensorReqInput, +) +from sglang.srt.utils import MultiprocessingSerializer + +# Import the module under test +from verl.workers.rollout.sglang_rollout.http_server_engine import ( + AsyncHttpServerAdapter, + HttpServerAdapter, + launch_server_process, +) + + +@pytest.fixture(scope="session") +def event_loop(): + """Create an event loop for the entire test session.""" + loop = asyncio.new_event_loop() + yield loop + loop.close() + + +@pytest.fixture +def basic_adapter_kwargs(): + """Provide basic kwargs for creating HTTP server adapters.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def router_adapter_kwargs(): + """Provide kwargs for creating adapters with router configuration.""" + return { + "router_ip": "192.168.1.1", + "router_port": 8080, + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def non_master_adapter_kwargs(): + """Provide kwargs for creating non-master node adapters.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 1, # Non-master + "model_path": "/tmp/test_model", + } + + +@pytest.fixture +def mock_launch_server_process(): + """Mock the launch_server_process function for testing without actual server startup.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.launch_server_process") as mock_launch: + mock_process = Mock() + mock_process.is_alive.return_value = True + mock_process.pid = 12345 + mock_launch.return_value = mock_process + yield mock_launch + + +@pytest.fixture +def mock_multiprocessing_process(): + """Create mock multiprocessing.Process for testing without actual process creation.""" + from unittest.mock import patch + + with 
patch("verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process") as mock_process_class: + mock_process = Mock() + mock_process.is_alive.return_value = True + mock_process.pid = 12345 + mock_process_class.return_value = mock_process + yield mock_process + + +@pytest.fixture +def mock_requests_session(): + """Create mock requests.Session for testing HTTP interactions.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session = Mock() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_session.get.return_value = mock_response + mock_session.post.return_value = mock_response + mock_session_class.return_value.__enter__.return_value = mock_session + yield mock_session + + +@pytest.fixture +def mock_requests_post(): + """Mock requests.post for testing HTTP POST requests.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_post.return_value = mock_response + yield mock_post + + +@pytest.fixture +def mock_requests_get(): + """Mock requests.get for testing HTTP GET requests.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_get.return_value = mock_response + yield mock_get + + +@pytest.fixture +def mock_aiohttp_session(): + """Create mock aiohttp.ClientSession for testing async HTTP interactions.""" + mock_session = AsyncMock() + mock_session.closed = False + + # Mock response + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = 
AsyncMock(return_value={"status": "success"}) + mock_response.raise_for_status = Mock() + + # Mock context managers + mock_session.get.return_value.__aenter__.return_value = mock_response + mock_session.post.return_value.__aenter__.return_value = mock_response + + return mock_session + + +@pytest.fixture +def mock_kill_process_tree(): + """Mock kill_process_tree function for testing cleanup without actual process termination.""" + from unittest.mock import patch + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.kill_process_tree") as mock_kill: + yield mock_kill + + +# Test environment fixtures for real SGLang testing +@pytest.fixture(scope="session") +def sglang_test_model_path(): + """Provide a test model path for SGLang tests. + + This can be overridden by environment variable SGLANG_TEST_MODEL_PATH + for tests that need a real model. + """ + import os + + return os.getenv("SGLANG_TEST_MODEL_PATH", "/tmp/test_model") + + +@pytest.fixture +def real_adapter_kwargs(sglang_test_model_path): + """Provide kwargs for creating adapters with real SGLang integration.""" + return { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": sglang_test_model_path, + } + + +@pytest.fixture(autouse=True) +def mock_server_args_post_init(): + """Mock ServerArgs.__post_init__ to skip model path validation.""" + from unittest.mock import patch + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.ServerArgs.__post_init__", return_value=None + ) as mock_post_init: + yield mock_post_init + + +class TestLaunchServerProcess: + """Test cases for launch_server_process function.""" + + def test_launch_server_process_success( + self, mock_multiprocessing_process, mock_requests_session, real_adapter_kwargs + ): + """Test successful server process launch and health check.""" + # Import real SGLang ServerArgs + from sglang.srt.server_args import ServerArgs + + # Create server args using real ServerArgs + server_args = 
ServerArgs(**real_adapter_kwargs) + + # Test + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session_class.return_value.__enter__.return_value = mock_requests_session + + result = launch_server_process(server_args, first_rank_in_node=True) + + # Assertions + assert result == mock_multiprocessing_process + mock_multiprocessing_process.start.assert_called_once() + assert mock_requests_session.get.call_count >= 2 # health_generate and flush_cache + + def test_launch_server_process_non_master(self, mock_multiprocessing_process, non_master_adapter_kwargs): + """Test server launch for non-master nodes (should return immediately).""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**non_master_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + result = launch_server_process(server_args, first_rank_in_node=True) + + assert result == mock_multiprocessing_process + mock_multiprocessing_process.start.assert_not_called() + + def test_launch_server_process_timeout(self, mock_multiprocessing_process, real_adapter_kwargs): + """Test timeout during server health check.""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**real_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process_class.return_value = mock_multiprocessing_process + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.Session") as mock_session_class: + mock_session = Mock() + mock_session.get.side_effect = 
requests.RequestException("Connection failed") + mock_session_class.return_value.__enter__.return_value = mock_session + + import itertools + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.time.time", + side_effect=itertools.chain([0], itertools.repeat(400)), # 第一次返回0,之后一直返回400 + ): + with pytest.raises(TimeoutError): + launch_server_process(server_args, first_rank_in_node=True) + + mock_multiprocessing_process.terminate.assert_called_once() + + def test_launch_server_process_died(self, real_adapter_kwargs): + """Test server process dies during startup.""" + from sglang.srt.server_args import ServerArgs + + server_args = ServerArgs(**real_adapter_kwargs) + + with patch( + "verl.workers.rollout.sglang_rollout.http_server_engine.multiprocessing.Process" + ) as mock_process_class: + mock_process = Mock() + mock_process.is_alive.return_value = False + mock_process_class.return_value = mock_process + + with pytest.raises(RuntimeError, match="Server process terminated unexpectedly"): + launch_server_process(server_args, first_rank_in_node=True) + + +class TestHttpServerEngineAdapter: + """Test cases for HttpServerEngineAdapter class.""" + + def test_init_with_router_registration(self, mock_launch_server_process, mock_requests_post, router_adapter_kwargs): + """Test initialization with router registration.""" + adapter = HttpServerAdapter(**router_adapter_kwargs) + + assert adapter.router_ip == "192.168.1.1" + assert adapter.router_port == 8080 + assert adapter.process == mock_launch_server_process.return_value + mock_requests_post.assert_called_once() + + def test_init_without_router(self, mock_launch_server_process, basic_adapter_kwargs): + """Test initialization without router registration.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + assert adapter.router_ip is None + assert adapter.router_port is None + assert adapter.process == mock_launch_server_process.return_value + + def test_register_with_router_failure(self, 
mock_launch_server_process, router_adapter_kwargs): + """Test router registration failure handling.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_post.side_effect = requests.RequestException("Connection failed") + + # Should not raise exception, just log error + adapter = HttpServerAdapter(**router_adapter_kwargs) + + assert adapter.router_ip == "192.168.1.1" + mock_post.assert_called_once() + + def test_make_request_success(self, mock_launch_server_process, basic_adapter_kwargs): + """Test successful HTTP request.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "success"} + mock_post.return_value = mock_response + + result = adapter._make_request("test_endpoint", {"param": "value"}) + + assert result == {"status": "success"} + mock_post.assert_called_with( + "http://localhost:8000/test_endpoint", + json={"param": "value"}, + timeout=adapter.timeout, + ) + + def test_make_request_get_method(self, mock_launch_server_process, basic_adapter_kwargs): + """Test HTTP GET request.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": "test"} + mock_get.return_value = mock_response + + result = adapter._make_request("test_endpoint", method="GET") + + assert result == {"data": "test"} + mock_get.assert_called_with("http://localhost:8000/test_endpoint", timeout=adapter.timeout) + + def test_make_request_non_master(self, mock_launch_server_process): + """Test request from non-master node returns empty dict.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": 
"/tmp/test_model"} + adapter = HttpServerAdapter(**kwargs) + result = adapter._make_request("test_endpoint") + + assert result == {} + + def test_make_request_retry_logic(self, mock_launch_server_process, basic_adapter_kwargs): + """Test retry logic for failed requests.""" + adapter = HttpServerAdapter(max_attempts=3, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + with patch("time.sleep") as mock_sleep: + # First two calls fail, third succeeds + mock_post.side_effect = [ + requests.exceptions.Timeout(), + requests.exceptions.ConnectionError(), + Mock(status_code=200, json=lambda: {"success": True}), + ] + + result = adapter._make_request("test_endpoint") + + assert result == {"success": True} + assert mock_post.call_count == 3 + assert mock_sleep.call_count == 2 + + def test_make_request_http_error(self, mock_launch_server_process, basic_adapter_kwargs): + """Test HTTP error handling.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("404 Not Found") + mock_post.return_value = mock_response + + with pytest.raises(requests.exceptions.HTTPError): + adapter._make_request("test_endpoint") + + def test_make_request_max_attempts_exceeded(self, mock_launch_server_process, basic_adapter_kwargs): + """Test max retries exceeded.""" + adapter = HttpServerAdapter(max_attempts=1, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + with patch("time.sleep"): + mock_post.side_effect = requests.exceptions.Timeout() + + with pytest.raises(RuntimeError, match="Failed to complete request"): + adapter._make_request("test_endpoint") + + assert mock_post.call_count == 1 # Initial retry + + def 
test_update_weights_from_tensor_strict(self, mock_launch_server_process, basic_adapter_kwargs): + import base64 + + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=[b"tensor1", b"tensor2"], + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + expected_b64_1 = base64.b64encode(b"tensor1").decode("utf-8") + expected_b64_2 = base64.b64encode(b"tensor2").decode("utf-8") + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [expected_b64_1, expected_b64_2], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def test_update_weights_from_tensor_empty(self, mock_launch_server_process, basic_adapter_kwargs): + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=[], + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def 
test_update_weights_from_tensor_none(self, mock_launch_server_process, basic_adapter_kwargs): + from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput + + from verl.workers.rollout.sglang_rollout.http_server_engine import HttpServerAdapter + + basic_adapter_kwargs.setdefault("node_rank", 0) + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "updated"} + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + + assert result == {"status": "updated"} + + mock_request.assert_called_once_with( + "update_weights_from_tensor", + { + "serialized_named_tensors": [], + "load_format": "safetensors", + "flush_cache": True, + }, + ) + + def test_generate(self, mock_launch_server_process, basic_adapter_kwargs): + """Test generate method.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"text": "Generated text"} + + result = adapter.generate( + prompt="Hello world", + sampling_params={"temperature": 0.7}, + return_logprob=True, + ) + + assert result == {"text": "Generated text"} + mock_request.assert_called_once_with( + "generate", + { + "text": "Hello world", + "sampling_params": {"temperature": 0.7}, + "return_logprob": True, + }, + only_master=False, + ) + + def test_flush_cache(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush_cache method.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + with patch("time.sleep") as mock_sleep: + # First call fails, second succeeds + mock_responses = [ + Mock(status_code=503), # Service unavailable + Mock(status_code=200, json=lambda: {"cache_flushed": True}), + 
] + mock_get.side_effect = mock_responses + + result = adapter.flush_cache() + + assert result == {"cache_flushed": True} + assert mock_get.call_count == 2 + mock_sleep.assert_called_once() + + def test_flush_cache_non_master(self, mock_launch_server_process): + """Test flush_cache for non-master node.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": "/tmp/test_model"} + adapter = HttpServerAdapter(**kwargs) + result = adapter.flush_cache() + + assert result == {} + + def test_memory_management_methods(self, mock_launch_server_process, basic_adapter_kwargs): + """Test memory release and resume methods.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test release_memory_occupation + result = adapter.release_memory_occupation(["weights", "kv_cache"]) + assert result == {"status": "success"} + mock_request.assert_called_with("release_memory_occupation", {"tags": ["weights", "kv_cache"]}) + + # Test resume_memory_occupation + result = adapter.resume_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("resume_memory_occupation", {"tags": ["weights"]}) + + def test_generation_control_methods(self, mock_launch_server_process, basic_adapter_kwargs): + """Test generation control methods.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + def test_shutdown(self, mock_launch_server_process, mock_kill_process_tree, router_adapter_kwargs): + """Test shutdown method.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_response = Mock() + mock_response.status_code = 200 + mock_post.return_value = mock_response + + adapter = HttpServerAdapter(**router_adapter_kwargs) + + 
adapter.shutdown() + + # Should unregister from router + assert mock_post.call_count == 2 # Once for registration, once for unregistration + # Should kill process + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + def test_shutdown_with_errors(self, mock_launch_server_process, mock_kill_process_tree, router_adapter_kwargs): + """Test shutdown method with errors.""" + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + # Mock registration success but unregistration failure + mock_post.side_effect = [ + Mock(status_code=200), # Registration success + requests.RequestException("Unregistration failed"), # Unregistration failure + ] + + # Mock process kill failure + mock_kill_process_tree.side_effect = Exception("Kill failed") + + adapter = HttpServerAdapter(**router_adapter_kwargs) + + # Should not raise exceptions + adapter.shutdown() + + assert mock_post.call_count == 2 + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + # Edge cases for HttpServerEngineAdapter + def test_empty_and_none_parameters(self, mock_launch_server_process, basic_adapter_kwargs): + """Test handling of empty and None parameters.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format=None, + flush_cache=None, + ) + + # Test generate with all None parameters + result = adapter.generate() + assert result == {"status": "success"} + + # Test with empty lists + result = adapter.update_weights_from_tensor(req) + assert result == {"status": "success"} + + # Test with empty tags + result = adapter.release_memory_occupation(req) + assert result == {"status": "success"} + + def test_large_payload_handling(self, mock_launch_server_process, basic_adapter_kwargs): + 
"""Test handling of large payloads.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test with large tensor list + large_tensor_list = [MultiprocessingSerializer.serialize(f"tensor_{i}") for i in range(1000)] + + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=large_tensor_list, + load_format="safetensors", + flush_cache=True, + ) + result = adapter.update_weights_from_tensor(req) + assert result == {"status": "success"} + + # Test with large prompt + large_prompt = "A" * 10000 + result = adapter.generate(prompt=large_prompt) + assert result == {"status": "success"} + + def test_timeout_edge_cases(self, mock_launch_server_process): + """Test various timeout scenarios.""" + # Test with very small timeout + kwargs = {"host": "localhost", "port": 8000, "node_rank": 0, "model_path": "/tmp/test_model", "timeout": 0.001} + adapter = HttpServerAdapter(**kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + mock_post.side_effect = requests.exceptions.Timeout() + + with pytest.raises(RuntimeError, match="Failed to complete request"): + adapter._make_request("test_endpoint") + + def test_extreme_configuration_values(self, mock_launch_server_process): + """Test extreme configuration values.""" + # Test with extreme values + kwargs = { + "host": "localhost", + "port": 8000, + "node_rank": 0, + "model_path": "/tmp/test_model", + "timeout": 0.001, # Very small + "max_attempts": 100, # Very large + "retry_delay": 0.001, # Very small + } + adapter = HttpServerAdapter(**kwargs) + + assert adapter.timeout == 0.001 + assert adapter.max_attempts == 100 + assert adapter.retry_delay == 0.001 + + +class TestAsyncHttpServerEngineAdapter: + """Test cases for AsyncHttpServerEngineAdapter class.""" + + def test_init(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async 
adapter initialization.""" + adapter = AsyncHttpServerAdapter(max_connections=50, **basic_adapter_kwargs) + + assert adapter.max_connections == 50 + + @pytest.mark.asyncio + async def test_make_async_request_success(self, mock_launch_server_process, basic_adapter_kwargs): + """Test successful async HTTP request.""" + + # Instantiate adapter + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={"status": "success"}) + mock_response.raise_for_status = Mock() + + mock_post_context_manager = AsyncMock() + mock_post_context_manager.__aenter__.return_value = mock_response + + mock_session = AsyncMock(spec=aiohttp.ClientSession) + mock_session.closed = False + mock_session.post.return_value = mock_post_context_manager + + mock_session_cm = AsyncMock() + mock_session_cm.__aenter__.return_value = mock_session + + with patch.object(adapter, "_get_session", return_value=mock_session_cm): + result = await adapter._make_async_request("test_endpoint", {"param": "value"}) + + # Assert result is correct + assert result == {"status": "success"} + + # Verify post was called + mock_session.post.assert_called_once_with( + "http://localhost:8000/test_endpoint", json={"param": "value"}, timeout=adapter.timeout + ) + + @pytest.mark.asyncio + async def test_make_async_request_get_method(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async GET request using aiohttp and proper context mocking.""" + + # Instantiate the async adapter + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={"data": "test"}) + mock_response.raise_for_status = Mock() + + mock_get_context_manager = AsyncMock() + mock_get_context_manager.__aenter__.return_value = mock_response + + mock_session = AsyncMock(spec=aiohttp.ClientSession) + mock_session.closed = False + 
mock_session.get.return_value = mock_get_context_manager + + mock_session_cm = AsyncMock() + mock_session_cm.__aenter__.return_value = mock_session + + with patch.object(adapter, "_get_session", return_value=mock_session_cm): + result = await adapter._make_async_request("test_endpoint", method="GET") + + # Validate + assert result == {"data": "test"} + mock_session.get.assert_called_once_with("http://localhost:8000/test_endpoint", timeout=adapter.timeout) + + @pytest.mark.asyncio + async def test_make_async_request_non_master(self, mock_launch_server_process): + """Test async request from non-master node.""" + kwargs = {"host": "localhost", "port": 8000, "node_rank": 1, "model_path": "/tmp/test_model"} + adapter = AsyncHttpServerAdapter(**kwargs) + result = await adapter._make_async_request("test_endpoint") + + assert result == {} + + @pytest.mark.asyncio + async def test_async_generate(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async generate method.""" + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_async_request", new_callable=AsyncMock) as mock_request: + mock_request.return_value = {"text": "Generated text"} + + result = await adapter.generate( + prompt="Hello world", + sampling_params={"temperature": 0.7}, + return_logprob=True, + ) + + assert result == {"text": "Generated text"} + mock_request.assert_called_once() + + @pytest.mark.asyncio + async def test_async_memory_management(self, mock_launch_server_process, basic_adapter_kwargs): + """Test async memory management methods.""" + adapter = AsyncHttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_async_request", new_callable=AsyncMock) as mock_request: + mock_request.return_value = {"status": "success"} + + # Test release_memory_occupation + result = await adapter.release_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("release_memory_occupation", 
{"tags": ["weights"]}) + + # Test resume_memory_occupation + result = await adapter.resume_memory_occupation(["weights"]) + assert result == {"status": "success"} + mock_request.assert_called_with("resume_memory_occupation", {"tags": ["weights"]}) + assert ( + mock_request.call_count == 2 + ) # resume memory occupation will also call release memory occupation once + + +class TestErrorRecovery: + """Test error recovery mechanisms.""" + + def test_flush_cache_recovery(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush cache recovery from failures.""" + adapter = HttpServerAdapter(max_attempts=2, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + # Simulate multiple failures then success + mock_get.side_effect = [ + requests.exceptions.ConnectionError(), + requests.exceptions.Timeout(), + Mock(status_code=503), # Service unavailable + Mock(status_code=200, json=lambda: {"cache_flushed": True}), + ] + + with patch("time.sleep"): + result = adapter.flush_cache() + assert result == {"cache_flushed": True} + + def test_flush_cache_max_attempts(self, mock_launch_server_process, basic_adapter_kwargs): + """Test flush cache max retries exceeded.""" + adapter = HttpServerAdapter(max_attempts=1, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.get") as mock_get: + # All attempts fail + mock_get.side_effect = requests.exceptions.ConnectionError() + + with patch("time.sleep"): + result = adapter.flush_cache() + assert result == {} # Should return empty dict on failure + + def test_network_partition_recovery(self, mock_launch_server_process, basic_adapter_kwargs): + """Test recovery from network partition scenarios.""" + adapter = HttpServerAdapter(max_attempts=3, **basic_adapter_kwargs) + + with patch("verl.workers.rollout.sglang_rollout.http_server_engine.requests.post") as mock_post: + # Simulate network partition then recovery 
+ mock_post.side_effect = [ + requests.exceptions.ConnectionError("Network unreachable"), + requests.exceptions.ConnectionError("Network unreachable"), + Mock(status_code=200, json=lambda: {"recovered": True}), + ] + + with patch("time.sleep"): + result = adapter._make_request("test_endpoint") + assert result == {"recovered": True} + + +class TestResourceManagement: + """Test resource management and cleanup.""" + + def test_resource_cleanup_on_exception( + self, mock_launch_server_process, mock_kill_process_tree, basic_adapter_kwargs + ): + """Test resource cleanup when exceptions occur.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Simulate exception during operation + with patch.object(adapter, "_make_request", side_effect=Exception("Test error")): + try: + adapter.generate(prompt="test") + except Exception: + pass + + # Cleanup should still work + adapter.shutdown() + mock_kill_process_tree.assert_called_once_with(mock_launch_server_process.return_value.pid) + + def test_multiple_shutdown_calls(self, mock_launch_server_process, basic_adapter_kwargs): + """Test multiple shutdown calls are safe.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Multiple shutdown calls should be safe + adapter.shutdown() + adapter.shutdown() + adapter.shutdown() + + +class TestDataTypeHandling: + """Test handling of various data types.""" + + def test_complex_data_structures(self, mock_launch_server_process, basic_adapter_kwargs): + """Test handling of complex data structures.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {"status": "success"} + + # Test with complex sampling params + complex_sampling_params = { + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "repetition_penalty": 1.1, + "stop_sequences": ["", "\n\n"], + "max_tokens": 100, + "logit_bias": {"token_123": 0.5, "token_456": -0.5}, + "nested_config": { + "beam_search": True, + 
"num_beams": 4, + "early_stopping": True, + }, + } + + result = adapter.generate( + prompt="Test prompt", + sampling_params=complex_sampling_params, + ) + + assert result == {"status": "success"} + # Verify the complex structure was passed through + call_args = mock_request.call_args[0][1] + assert call_args["sampling_params"] == complex_sampling_params + + +class TestIntegration: + """Integration tests for both adapters.""" + + def test_error_scenarios(self, mock_launch_server_process, basic_adapter_kwargs): + """Test various error scenarios.""" + adapter = HttpServerAdapter(**basic_adapter_kwargs) + + # Test with None payload + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {} + result = adapter.generate() + assert result == {} + + # Test with empty parameters + with patch.object(adapter, "_make_request") as mock_request: + mock_request.return_value = {} + req = UpdateWeightsFromTensorReqInput( + serialized_named_tensors=None, + load_format=None, + flush_cache=None, + ) + result = adapter.update_weights_from_tensor(req) + assert result == {} diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py new file mode 100644 index 0000000000000000000000000000000000000000..b924521705305f9c53d1b7eef0d3d70d017b2df9 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/run_fsdp_vllm.py @@ -0,0 +1,166 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import torch +import torch.distributed as dist +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import ShardedStateDictConfig, ShardingStrategy, StateDictType +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from vllm import SamplingParams + +from verl.third_party.vllm import LLM +from verl.utils.distributed import initialize_global_process_group + + +def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor) -> list[int]: + """Remove left padding tokens before feeding prompts to vLLM.""" + non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0] + return prompt_token_ids[non_pad_index:].tolist() + + +def main(): + assert torch.cuda.is_available(), "CUDA must be present to run FSDP vLLM example" + local_rank, rank, world_size = initialize_global_process_group() + + local_cache_path = "~/.cache/verl/rlhf" + local_cache_path = os.path.expanduser(local_cache_path) + hdfs_path = "Qwen/Qwen2-7B-Instruct" + + from verl.utils.fs import copy_to_local + + local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path) + tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True) + actor_model_config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=True) + with torch.device("cuda"): + actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True) + actor_model.to(torch.bfloat16) 
+ + max_prompt_length = 16 + response_length = 32 + preencode_prompts = [ + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + tokenizer.pad_token = tokenizer.eos_token + prompts = tokenizer(preencode_prompts, return_tensors="pt", padding=True) + input_ids = prompts["input_ids"] + attention_mask = prompts["attention_mask"] + from verl.utils.torch_functional import pad_sequence_to_length + + input_ids = pad_sequence_to_length(input_ids, max_prompt_length, tokenizer.pad_token_id, left_pad=True).cuda() + attention_mask = pad_sequence_to_length(attention_mask, max_prompt_length, 0, left_pad=True).cuda() + + from transformers import GenerationConfig + + generation_config = GenerationConfig(do_sample=False) + actor_model.cuda() + output = actor_model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=32, + # max_length=max_length, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + generation_config=generation_config, + # renormalize_logits=True, + output_scores=False, # this is potentially very large + return_dict_in_generate=True, + use_cache=False, + ) # may OOM when use_cache = True + seq = output.sequences + response = seq[:, max_prompt_length:] + + print(f"hf response: {tokenizer.batch_decode(response)}") + + tensor_model_parallel_size = 4 + from torch.distributed.device_mesh import init_device_mesh + + device_mesh = init_device_mesh("cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32) + fsdp_model = FSDP( + actor_model, + use_orig_params=True, + auto_wrap_policy=None, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + cpu_offload=CPUOffload(offload_params=False), + sync_module_states=False, + device_mesh=device_mesh, + ) + + FSDP.set_state_dict_type( + 
fsdp_model, state_dict_type=StateDictType.SHARDED_STATE_DICT, state_dict_config=ShardedStateDictConfig() + ) + + state_dict = fsdp_model.state_dict() + + sampling_params = SamplingParams( + temperature=0, top_p=1, n=1, max_tokens=response_length, logprobs=1, ignore_eos=True, detokenize=False + ) + + print(actor_model_config) + llm = LLM( + model=None, + tokenizer=tokenizer, + model_hf_config=actor_model_config, + tensor_parallel_size=tensor_model_parallel_size, + enforce_eager=True, + dtype="bfloat16", + load_format="dummy_dtensor", + gpu_memory_utilization=0.8, + trust_remote_code=True, + ) + + # Warmup iterations + for _ in range(10): + torch.cuda.synchronize() + llm.sync_model_weights(actor_weights=state_dict, load_format="dtensor") + torch.cuda.synchronize() + dist.barrier() + + start_time = time.time() + llm.sync_model_weights(actor_weights=state_dict, load_format="dtensor") + torch.cuda.synchronize() + dist.barrier() + end_time = time.time() + + # Calculate elapsed time + elapsed_time = end_time - start_time + print(f"Time taken: {elapsed_time:.6f} seconds") + + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + idx_list = [] + batch_size = input_ids.shape[0] + + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id + for i in range(batch_size): + idx_list.append(_pre_process_inputs(pad_token_id, input_ids[i])) + print("start generation") + outputs = llm.generate(prompt_token_ids=idx_list, sampling_params=sampling_params, use_tqdm=False) + vllm_output = outputs[0].cuda() + if torch.distributed.get_rank() == 0: + print(f"hf response: {tokenizer.batch_decode(response)}") + print(f"vllm response: {tokenizer.batch_decode(vllm_output)}") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py new file mode 100644 index 
0000000000000000000000000000000000000000..82034f1e9059b5c8d91e943e180d73af0f9e7d61 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/rollout_vllm/test_vllm_abort.py @@ -0,0 +1,217 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test vLLM abort functionality. + +Usage: + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s + or + python tests/workers/rollout/rollout_vllm/test_vllm_abort.py +""" + +import asyncio +import os +import time +from uuid import uuid4 + + +def test_vllm_abort(): + # ==================== Configuration ==================== + MODEL_PATH = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") # /root/models/Qwen/Qwen2.5-1.5B-Instruct + GPUS_PER_NODE = 2 + TP_SIZE = 1 + ROLLOUT_NAME = "vllm" + ABORT_DELAY = 0.5 # seconds to wait before aborting + + print("=" * 60) + print("vLLM Abort Test") + print("=" * 60) + print(f"Model: {MODEL_PATH}") + print(f"GPUs: {GPUS_PER_NODE}, TP Size: {TP_SIZE}") + print(f"Abort Delay: {ABORT_DELAY}s") + print("=" * 60) + + # ==================== Initialize Ray ==================== + print("\n[1] Initializing Ray...") + import ray + + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + }, + ignore_reinit_error=True, + ) + + try: + # ==================== Create Config ==================== + print("\n[2] Creating 
config...") + from hydra import compose, initialize_config_dir + + config_dir = os.path.abspath("verl/verl/trainer/config") + if not os.path.exists(config_dir): + config_dir = os.path.abspath("verl/trainer/config") + + with initialize_config_dir(config_dir=config_dir, version_base=None): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = GPUS_PER_NODE + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = MODEL_PATH + config.actor_rollout_ref.rollout.name = ROLLOUT_NAME + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = TP_SIZE + config.actor_rollout_ref.rollout.prompt_length = 512 + config.actor_rollout_ref.rollout.response_length = 512 # Longer for abort test + + # ==================== Create Rollout Server ==================== + print("\n[3] Creating rollout server (this may take a while)...") + from verl.workers.rollout.replica import get_rollout_replica_class + + rollout_config = config.actor_rollout_ref.rollout + model_config = config.actor_rollout_ref.model + + rollout_server_class = get_rollout_replica_class(ROLLOUT_NAME) + server = rollout_server_class( + replica_rank=0, + config=rollout_config, + model_config=model_config, + gpus_per_node=GPUS_PER_NODE, + ) + + asyncio.run(server.init_standalone()) + server_handle = server._server_handle + print(f"Server address: {server._server_address}") + + # ==================== Load Tokenizer ==================== + print("\n[4] Loading tokenizer...") + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + + # ==================== Prepare Prompts ==================== + print("\n[5] Preparing prompts (to ensure generation takes time)...") + NUM_PROMPTS = 8 + prompts = [ + "Write a very long story about a brave knight and dragon.", + "Explain the history of the Roman Empire in great detail.", + "Describe quantum computing and its applications 
thoroughly.", + "Write an essay about climate change and its global effects.", + "Who won the Champions League in 2019?", + "Write a detailed analysis of Shakespeare's Hamlet.", + "Describe the process of photosynthesis in plants.", + "Write about the French Revolution and its consequences.", + ] + + all_prompt_ids = [] + for prompt in prompts[:NUM_PROMPTS]: + messages = [{"role": "user", "content": prompt}] + prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + all_prompt_ids.append(prompt_ids) + print(f"Prepared {NUM_PROMPTS} prompts") + + # ==================== Start Generations and Abort ==================== + print("\n[6] Starting generations and then aborting...") + + sampling_params = { + "temperature": 1.0, + "top_p": 1.0, + "logprobs": False, + } + + # Start all generations concurrently + print(f"\n Starting {NUM_PROMPTS} generations...") + generate_refs = [] + for i, prompt_ids in enumerate(all_prompt_ids): + request_id = f"abort_test_{i}_{uuid4().hex[:8]}" + ref = server_handle.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + image_data=None, + ) + generate_refs.append((i, request_id, ref)) + print(f" Started request {i}: {request_id}") + + # Wait before aborting + print(f"\n Waiting {ABORT_DELAY}s before abort...") + time.sleep(ABORT_DELAY) + + # Call abort + print(" Calling abort_all_requests...") + abort_start = time.perf_counter() + abort_result = ray.get(server_handle.abort_all_requests.remote()) + abort_time = time.perf_counter() - abort_start + + print(f" Abort took: {abort_time * 1000:.2f}ms") + print(f" Abort result: {abort_result}") + + # Wait for all generations to finish + print("\n Waiting for all generations to complete...") + outputs = [] + for i, request_id, ref in generate_refs: + try: + output = ray.get(ref, timeout=10.0) + outputs.append((i, request_id, output)) + except ray.exceptions.GetTimeoutError: + print(f" Request {i} timed out!") 
+ outputs.append((i, request_id, None)) + + # ==================== Print Results ==================== + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60) + + aborted_count = 0 + completed_count = 0 + timeout_count = 0 + + for i, request_id, output in outputs: + if output is None: + timeout_count += 1 + print(f"[{i}] {request_id}: TIMEOUT") + elif output.stop_reason == "aborted": + aborted_count += 1 + print(f"[{i}] {request_id}: ABORTED ({len(output.token_ids)} tokens)") + print(f"Partial Output: {tokenizer.decode(output.token_ids)}") + else: + completed_count += 1 + print(f"[{i}] {request_id}: COMPLETED ({output.stop_reason}, {len(output.token_ids)} tokens)") + print(f"Full Output: {tokenizer.decode(output.token_ids)}") + + print(f"\nSummary: {aborted_count} aborted, {completed_count} completed, {timeout_count} timeout") + + print("\n" + "=" * 60) + print(f"Abort result: {abort_result}") + print("=" * 60) + print("Abort test completed!") + + # Assertions for pytest + assert timeout_count == 0, "No requests should timeout" + assert aborted_count + completed_count == NUM_PROMPTS, "All requests should finish" + assert "aborted_count" in abort_result, "Abort result should contain aborted_count" + assert abort_time < 1.0, "Abort should be fast (< 1 second)" + + finally: + print("\nShutting down Ray...") + ray.shutdown() + + +if __name__ == "__main__": + # Can still run as standalone script + test_vllm_abort() diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb6f4bb2ff3f04a6127304828793151c7b24052 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_hf_rollout.py @@ -0,0 +1,180 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from omegaconf import OmegaConf +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import ShardedStateDictConfig, ShardingStrategy, StateDictType +from transformers import AutoModelForCausalLM, AutoTokenizer + +from verl import DataProto +from verl.utils.distributed import initialize_global_process_group +from verl.utils.fs import copy_to_local +from verl.utils.model import compute_position_id_with_mask +from verl.workers.rollout.hf_rollout import HFRollout + +BASE_HF_ROLLOUT_CONFIG = { + "temperature": 1.0, + "top_k": -1, + "top_p": 1, + "prompt_length": 64, + "response_length": 64, + "do_sample": True, + "n": 1, + "val_kwargs": { + "top_k": -1, + "top_p": 1.0, + "temperature": 0, + "n": 1, + "do_sample": False, + }, +} + + +def prepare_input_dataproto(tokenizer, config, validate): + preencode_prompts = [ + [{"role": "user", "content": "Who won the Champions League in 2019?"}], + [{"role": "user", "content": "The founder of Apple is"}], + [{"role": "user", "content": "What's your name"}], + ] + formatted_prompts = [ + tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) + for conversation in preencode_prompts + ] + prompts = tokenizer(formatted_prompts, return_tensors="pt", padding="max_length", 
max_length=config.prompt_length) + input_dataproto = DataProto.from_dict( + { + "input_ids": prompts["input_ids"], + "attention_mask": prompts["attention_mask"], + "position_ids": compute_position_id_with_mask(prompts["attention_mask"]), + }, + meta_info={ + "bos_token_id": tokenizer.bos_token_id, + "eos_token_id": tokenizer.eos_token_id, + "pad_token_id": tokenizer.pad_token_id, + "validate": validate, + }, + ) + return input_dataproto + + +def prepare_fsdp_model(model, world_size): + from torch.distributed.device_mesh import init_device_mesh + + device_mesh = init_device_mesh("cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32) + + fsdp_model = FSDP( + model, + use_orig_params=True, + auto_wrap_policy=None, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + cpu_offload=CPUOffload(offload_params=False), + sync_module_states=False, + device_mesh=device_mesh, + ) + + FSDP.set_state_dict_type( + fsdp_model, state_dict_type=StateDictType.SHARDED_STATE_DICT, state_dict_config=ShardedStateDictConfig() + ) + return fsdp_model + + +def test_hf_rollout(n: int = 1, do_sample: bool = True, validate: bool = False): + config = OmegaConf.create(BASE_HF_ROLLOUT_CONFIG) + config.update({"n": n, "do_sample": do_sample}) + + assert torch.cuda.device_count() >= 2, "At least 2 GPUs is required to run tp+dp tests." 
+ local_rank, rank, world_size = initialize_global_process_group() + + # Initialize model and tokenizer + local_cache_path = "~/.cache/verl/rlhf" + local_cache_path = os.path.expanduser(local_cache_path) + hdfs_path = "Qwen/Qwen2-7B-Instruct" + local_model_path = copy_to_local(src=hdfs_path, cache_dir=local_cache_path) + tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side="left", trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + # Initialize FSDP model + actor_model = AutoModelForCausalLM.from_pretrained(local_model_path, trust_remote_code=True) + actor_model.to(torch.bfloat16) + fsdp_model = prepare_fsdp_model(actor_model, world_size) + + # Initialize HFRollout and start generate + hf_rollout = HFRollout(fsdp_model, OmegaConf.create(config)) + input = prepare_input_dataproto(tokenizer, config, validate).to(torch.cuda.current_device()) + outputs = hf_rollout.generate_sequences(input) + + # check generated batch size is expected + generated_batch_size = outputs.batch.batch_size[0] + assert generated_batch_size == input.batch.batch_size[0] * config.n + + for i in range(generated_batch_size): + prompt_tokens = outputs.batch["prompts"][i] + prompt_mask = prompt_tokens != tokenizer.pad_token_id + prompt_tokens = prompt_tokens[prompt_mask] + decoded_prompt = tokenizer.decode(prompt_tokens, skip_special_tokens=False) + + response_tokens = outputs.batch["responses"][i] + response_mask = response_tokens != tokenizer.pad_token_id + response_tokens = response_tokens[response_mask] + decoded_response = tokenizer.decode(response_tokens, skip_special_tokens=False) + + attention_mask = outputs.batch["attention_mask"][i] + position_ids = outputs.batch["position_ids"][i] + prompt_length = outputs.batch["prompts"].size(1) + response_length = outputs.batch["responses"].size(1) + + assert attention_mask.size(0) == prompt_length + response_length + assert position_ids.size(0) == prompt_length + response_length + + # check response attention 
mask is expected + response_attention = attention_mask[prompt_length:] + eos_positions = (outputs.batch["responses"][i] == tokenizer.pad_token_id).nonzero(as_tuple=True)[0] + if len(eos_positions) > 0: + first_eos_pos = eos_positions[0].item() + assert response_attention[: first_eos_pos + 1].all(), "Response attention mask should be 1 until EOS" + if first_eos_pos + 1 < response_length: + assert not response_attention[first_eos_pos + 1 :].any(), ( + "Response attention mask should be 0 after EOS" + ) + else: + assert response_attention.all(), "Response attention mask should be all 1 if no EOS token" + + # check response position ids is expected + prompt_positions = position_ids[:prompt_length] + response_positions = position_ids[prompt_length:] + valid_response_length = min(len(response_tokens), response_length) + if valid_response_length > 0: + assert response_positions[0] == prompt_positions[-1] + 1 + for j in range(1, valid_response_length): + assert response_positions[j] == response_positions[j - 1] + 1 + + # print generated text for inspection + if torch.distributed.get_rank() == 0: + print(f"prompt: {decoded_prompt}") + print(f"response: {decoded_response}") + print("=" * 30) + + +if __name__ == "__main__": + test_hf_rollout(n=2, do_sample=True, validate=False) + # test_hf_rollout(n=1, do_sample=False, validate=True) + # test_hf_rollout(n=1, do_sample=True, validate=False) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py new file mode 100644 index 0000000000000000000000000000000000000000..dea1b14eaf6bf13e09f4653ff02a0b7208160794 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_async_rollout_multimodal_delta.py @@ -0,0 +1,194 @@ +# Copyright 2025 Amazon.com, Inc. 
or its affiliates +# Copyright 2023-2024 SGLang Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pytest + +from verl.tools.schemas import ToolResponse +from verl.utils.dataset.vision_utils import process_image +from verl.utils.tokenizer import hf_processor +from verl.workers.rollout.schemas import ( + AsyncRolloutRequest, + AsyncRolloutRequestStateEnum, + TokenizationSanityCheckModeEnum, +) + + +def _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=False): + assert len(image_list) == len(description_list) + # Get the smallest dimensions across all images + processed_images = [] + for img_url in image_list: + img = process_image(img_url) + processed_images.append(img) + + min_width = min(img.size[0] for img in processed_images) + min_height = min(img.size[1] for img in processed_images) + min_size = (min_width, min_height) + + if resize_image: + processed_images_resized = [] + for img in processed_images: + img = img.resize(min_size) + processed_images_resized.append(img) + processed_images = processed_images_resized + + # Initial message history + system_prompt = ( + "You will be provided with an image. 
Describe this image and then generate a new image for the next round" + ) + messages = [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Here is the first image provided: "}, + {"type": "image", "image": [processed_images[0]]}, + ], + }, + ] + + # Initial multi_modal_data with one image + multi_modal_data = {"image": [processed_images[0]], "video": []} + # Minimal required fields for AsyncRolloutRequest + + req = AsyncRolloutRequest( + batch_data_id=0, + request_id="test-req-1", + state=AsyncRolloutRequestStateEnum.PENDING, + messages=messages, + multi_modal_keys=["image", "video"], + multi_modal_data=multi_modal_data.copy(), + tool_schemas=[], + tools_kwargs={}, + interaction_kwargs={}, + input_ids=None, + prompt_ids=None, + response_ids=None, + attention_mask=None, + prompt_attention_mask=None, + response_attention_mask=None, + position_ids=None, + prompt_position_ids=None, + response_position_ids=None, + loss_mask=None, + prompt_loss_mask=None, + response_loss_mask=None, + reward_scores={}, + max_prompt_len=8192, + max_response_len=8192, + max_model_len=16384, + metrics={}, + use_inference_chat_template=True, + tokenization_sanity_check_mode=TokenizationSanityCheckModeEnum.STRICT, + generation_prompt_ids=None, + base_conv_wo_gen_prompt_end_pos=0, + base_conv_with_gen_prompt_end_pos=0, + processing_class=processor, + ) + + prev_generated_len = 0 + # Add First Assistant Message and first tool response message(image) + for idx, img in enumerate(processed_images): + if idx == 0: + continue + _ = req.get_generation_prompt_ids(processor) + req.add_assistant_message(processor, content=description_list[idx - 1]) + before_tool_call_len = req.input_ids.shape[-1] + req.add_tool_response_messages( + processor, [ToolResponse(image=[img], text="Here is the new image you requested: ")] + ) + after_tool_call_len = req.input_ids.shape[-1] + if prev_generated_len == 0: + prev_generated_len = 
after_tool_call_len - before_tool_call_len + else: + if resize_image: + assert after_tool_call_len - before_tool_call_len == prev_generated_len + assert req.multi_modal_data["image"] == processed_images[: idx + 1] + + _ = req.get_generation_prompt_ids(processor) + req.add_assistant_message(processor, content=description_list[-1]) + + messages = [msg.model_dump() for msg in req.messages] + tools = [tool.model_dump() for tool in req.tool_schemas] if req.tool_schemas else None + full_prompt_info = req._handle_apply_chat_template( + processor, + messages, + multi_modal_data=req.multi_modal_data, + tools=tools, + add_generation_prompt=False, + tokenize=True, + return_dict=True, + ) + full_prompt_ids = full_prompt_info["input_ids"] + assert full_prompt_ids.eq(req.input_ids).all() + + # We must use dict(full_prompt_info) to convert BatchFeature values to a new dict + # because np.array() only keeps the keys for BatchFeature. + full_prompt_multi_modal_inputs = full_prompt_info.copy() + full_prompt_multi_modal_inputs.pop("input_ids", None) + full_prompt_multi_modal_inputs.pop("attention_mask", None) + + for key in full_prompt_multi_modal_inputs: + assert full_prompt_multi_modal_inputs[key].eq(req.multi_modal_inputs[key]).all() + + +@pytest.mark.skipif( + hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) is None, + reason="Processor not available for Qwen/Qwen2.5-VL-B-Instruct", +) +def test_add_tool_response_messages_image_delta(): + processor = hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) + + # From Qwen2.5-VL-3B-Instruct HF example + img_1_url = {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"} + img_1_description = "A woman sits on the beach at sunset, smiling as she shares a high five with her large dog." 
+ # GitHub Logo + img_2_url = {"image": "https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png"} + img_2_description = "A GitHub Logo image" + # Octocat + img_3_url = {"image": "https://octodex.github.com/images/orderedlistocat.png"} + img_3_description = "An Octocat image" + + image_list = [img_1_url, img_2_url, img_3_url] + description_list = [img_1_description, img_2_description, img_3_description] + _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=False) + + +@pytest.mark.skipif( + hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) is None, + reason="Processor not available for Qwen/Qwen2.5-VL-B-Instruct", +) +def test_add_tool_response_messages_image_delta_resize_image(): + processor = hf_processor(os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")) + + # From Qwen2.5-VL-3B-Instruct HF example + img_1_url = {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"} + img_1_description = "A woman sits on the beach at sunset, smiling as she shares a high five with her large dog." 
+ # GitHub Logo + img_2_url = {"image": "https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png"} + img_2_description = "A GitHub Logo image" + # Octocat + img_3_url = {"image": "https://octodex.github.com/images/orderedlistocat.png"} + img_3_description = "An Octocat image" + + image_list = [img_1_url, img_2_url, img_3_url] + description_list = [img_1_description, img_2_description, img_3_description] + _test_add_tool_response_messages_image_delta(processor, image_list, description_list, resize_image=True) diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..0d3c7b5da2bea7c5ba757ba2b42cc30f58890eb7 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_sglang_rollout_sharding_manager.py @@ -0,0 +1,57 @@ +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch + +from verl.workers.rollout.sglang_rollout.utils import get_named_tensor_buckets + +_TENSOR_1MB = torch.zeros(512, 512) +_BYTES_1MB = 1 << 20 + + +@pytest.mark.parametrize( + "named_tensors, bucket_size_mb, gt_groups", + [ + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 0.5 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 1 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 1.5 * _BYTES_1MB, + [["a"], ["b"]], + ), + ( + [("a", _TENSOR_1MB), ("b", _TENSOR_1MB)], + 2 * _BYTES_1MB, + [["a", "b"]], + ), + ], +) +def test_get_named_tensor_buckets(named_tensors, bucket_size_mb, gt_groups: list[list[str]]): + named_tensors_iter = iter(named_tensors) + groups = list(get_named_tensor_buckets(named_tensors_iter, bucket_size_mb)) + assert len(groups) == len(gt_groups) + for group, gt_group in zip(groups, gt_groups, strict=True): + assert len(group) == len(gt_group) + for (name, _), (gt_name) in zip(group, gt_group, strict=True): + assert name == gt_name diff --git a/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py b/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1db46ab48359087e9979d6efd6ce787913b3e5d4 --- /dev/null +++ b/code/RL_model/verl/verl_train/tests/workers/rollout/test_vllm_cli_args_on_cpu.py @@ -0,0 +1,133 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import pytest + +from verl.workers.rollout.vllm_rollout.utils import build_cli_args_from_config + + +class TestBuildCliArgsFromConfig: + """Tests for CLI argument serialization from config dictionaries.""" + + def test_string_value(self): + """String values become '--key value'.""" + config = {"model": "gpt2"} + result = build_cli_args_from_config(config) + assert result == ["--model", "gpt2"] + + def test_integer_value(self): + """Integer values are converted to strings.""" + config = {"tensor-parallel-size": 4} + result = build_cli_args_from_config(config) + assert result == ["--tensor-parallel-size", "4"] + + def test_float_value(self): + """Float values are converted to strings.""" + config = {"temperature": 0.7} + result = build_cli_args_from_config(config) + assert result == ["--temperature", "0.7"] + + def test_bool_true(self): + """Bool True adds flag without value.""" + config = {"enable-prefix-caching": True} + result = build_cli_args_from_config(config) + assert result == ["--enable-prefix-caching"] + + def test_bool_false(self): + """Bool False is skipped entirely.""" + config = {"enable-prefix-caching": False} + result = build_cli_args_from_config(config) + assert result == [] + + def test_none_value(self): + """None values are skipped.""" + config = {"lora-path": None} + result = build_cli_args_from_config(config) + assert result == [] + + def test_list_values(self): + """List values are expanded into multiple arguments.""" + config = {"cudagraph-capture-sizes": [1, 2, 4, 8]} + result = build_cli_args_from_config(config) + assert result == ["--cudagraph-capture-sizes", "1", "2", "4", "8"] + + def test_empty_list(self): + """Empty lists are skipped (vLLM nargs='+' requires at least one value).""" + config = {"cudagraph-capture-sizes": []} + result = build_cli_args_from_config(config) + assert result == [] + + def 
test_list_with_strings(self): + """List of strings is properly expanded.""" + config = {"allowed-origins": ["http://localhost", "http://example.com"]} + result = build_cli_args_from_config(config) + assert result == ["--allowed-origins", "http://localhost", "http://example.com"] + + def test_dict_value(self): + """Dict values are JSON serialized.""" + config = {"extra-config": {"key": "value", "nested": True}} + result = build_cli_args_from_config(config) + assert result[0] == "--extra-config" + # JSON output may have different key ordering, so parse and compare + assert json.loads(result[1]) == {"key": "value", "nested": True} + + def test_mixed_config(self): + """Test a realistic mixed configuration.""" + config = { + "tensor-parallel-size": 4, + "enable-prefix-caching": True, + "disable-log-requests": False, + "lora-path": None, + "cudagraph-capture-sizes": [1, 2, 4, 8], + "max-model-len": 2048, + } + result = build_cli_args_from_config(config) + + # Check expected args are present + assert "--tensor-parallel-size" in result + assert "4" in result + assert "--enable-prefix-caching" in result + assert "--cudagraph-capture-sizes" in result + assert "1" in result + assert "8" in result + assert "--max-model-len" in result + assert "2048" in result + + # Check skipped values are not present + assert "--disable-log-requests" not in result + assert "--lora-path" not in result + + def test_preserves_order(self): + """Arguments should preserve dictionary order (Python 3.7+).""" + config = {"first": "a", "second": "b", "third": "c"} + result = build_cli_args_from_config(config) + assert result == ["--first", "a", "--second", "b", "--third", "c"] + + def test_empty_config(self): + """Empty config returns empty list.""" + config = {} + result = build_cli_args_from_config(config) + assert result == [] + + def test_single_element_list(self): + """Single element list works correctly.""" + config = {"sizes": [42]} + result = build_cli_args_from_config(config) + assert result 
== ["--sizes", "42"] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])