diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: true
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+    start_state_marker: <information>
+    end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
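
The config.yaml above is the fully composed snapshot that Hydra writes into each run's .hydra/ directory; values such as rollout.prompt_length are stored as ${...} interpolations into data.max_prompt_length and stay lazy until resolved. A minimal sketch (assuming omegaconf is available and the file is read from the run directory) that loads the dump, resolves the interpolations, and checks this run's batch geometry, 128 prompts per step split into mini-batches of 64 with 16 sequences per GPU on 2 GPUs:

    # Sketch only: load the dumped config and verify the interpolations resolve.
    # Assumes omegaconf >= 2.1 and the .hydra/config.yaml shown above.
    from omegaconf import OmegaConf

    cfg = OmegaConf.load(".hydra/config.yaml")
    OmegaConf.resolve(cfg)  # materialize ${...} references in place

    assert cfg.actor_rollout_ref.rollout.prompt_length == cfg.data.max_prompt_length
    assert cfg.critic.ppo_mini_batch_size == cfg.actor_rollout_ref.actor.ppo_mini_batch_size

    world_size = cfg.trainer.nnodes * cfg.trainer.n_gpus_per_node  # 1 * 2 = 2
    assert cfg.data.train_batch_size % cfg.actor_rollout_ref.actor.ppo_mini_batch_size == 0
    print(world_size, cfg.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu)  # 2 16
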
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..20e1de6e9e0c9570b2e880659fb8419389098566
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
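
The timestamped run directories throughout this diff (outputs/2026-02-01/20-18-07 and so on) come from the hydra.run.dir pattern recorded above; ${now:...} is Hydra's built-in strftime resolver, evaluated when the job launches. The same formatting in plain Python, purely as an illustration:

    # Illustration only: the directory name Hydra derives from hydra.run.dir.
    from datetime import datetime

    stamp = datetime.now()
    run_dir = f"outputs/{stamp:%Y-%m-%d}/{stamp:%H-%M-%S}"
    print(run_dir)  # e.g. outputs/2026-02-01/20-18-07
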
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
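
overrides.yaml preserves the exact command-line arguments of the run, including the leading + that appended ppo_micro_batch_size_per_gpu to the base config. A hedged sketch of replaying them through Hydra's compose API; the relative config_path is an assumption based on the config_sources recorded in hydra.yaml above:

    # Sketch, not part of the diff: re-compose this run's config from the
    # recorded overrides. config_path is relative to the calling file.
    import yaml
    from hydra import compose, initialize

    with open(".hydra/overrides.yaml") as f:
        overrides = yaml.safe_load(f)  # list of "key=value" strings

    with initialize(version_base="1.3", config_path="verl/trainer/config"):
        cfg = compose(config_name="ppo_trainer", overrides=overrides)
    print(cfg.trainer.experiment_name)  # llm_guard_3B_10k_v2
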
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: true
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+    start_state_marker: <information>
+    end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
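
This run, like the others in the diff, selects algorithm.adv_estimator: grpo, so advantages are group-relative and the critic section above is effectively inert. A minimal sketch of the estimator in its usual form, rewards standardized within each prompt's group of samples; note the rollout here records 'n': 1, so any real grouping must come from elsewhere in the trainer (this illustrates the idea, not verl's exact implementation):

    # Sketch of group-relative advantage (GRPO): standardize each prompt's
    # sampled rewards within its own group. Not verl's exact code.
    import torch

    def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # rewards: (num_prompts, n_samples_per_prompt)
        mean = rewards.mean(dim=1, keepdim=True)
        std = rewards.std(dim=1, keepdim=True)
        return (rewards - mean) / (std + eps)

    adv = grpo_advantages(torch.tensor([[1.0, 0.0, 0.0, 1.0]]))
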
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b957d355a727fca1d251b04fca5845220317331f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: true
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+    start_state_marker: <information>
+    end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
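
With use_kl_loss: false, KL regularization in these runs enters through algorithm.kl_penalty: kl under the fixed controller (kl_coef: 0.001), i.e. as a per-token penalty against the reference policy applied at the reward level. A schematic of that fixed-coefficient shaping, not verl's code:

    # Schematic: fixed-coefficient KL penalty applied at the reward level.
    import torch

    def shape_rewards(token_rewards, actor_logp, ref_logp, kl_coef=0.001):
        # kl_penalty: kl  ->  per-token log-ratio between actor and reference
        kl = actor_logp - ref_logp
        return token_rewards - kl_coef * kl

    r = torch.zeros(6)
    r[-1] = 1.0  # terminal task reward on the last token
    shaped = shape_rewards(r, torch.randn(6), torch.randn(6))
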
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c290ffc0bc650dd0d7560f57f3dec106ac85c173
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: false
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+    start_state_marker: <information>
+    end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
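
The one substantive difference in this run relative to the three earlier snapshots is actor_rollout_ref.model.use_remove_padding flipped to false, disabling the padding-removal (sequence-packing) fast path. As a rough illustration of what that optimization does, a conceptual sketch that packs a padded batch into one flat token stream with cumulative sequence lengths, the convention varlen attention kernels expect; this is not verl's implementation:

    # Conceptual sketch of padding removal: keep only real tokens and track
    # per-sequence boundaries via cu_seqlens for varlen attention.
    import torch

    input_ids = torch.tensor([[5, 6, 7, 0, 0],
                              [8, 9, 0, 0, 0]])
    attention_mask = (input_ids != 0).int()

    seqlens = attention_mask.sum(dim=1)                        # [3, 2]
    cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.long),
                            seqlens.cumsum(0)])                # [0, 3, 5]
    packed = input_ids[attention_mask.bool()]                  # [5, 6, 7, 8, 9]
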
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..af205efd98c461db581b5e14fb2d45f78553bd73
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=False
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=False
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: false
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+ start_state_marker: <information>
+ end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..161200400716bc2ccb65b8e81dea22b9c52b1c5f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=False
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=False
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: false
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+ start_state_marker: <information>
+ end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7c6ea002941bc3746cc9ed357c9b0ced4c1e5969
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=False
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=False
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: false
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+ start_state_marker: <information>
+ end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7792025f706af9ad26eb8c208a4ba75b6ebf08af
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=False
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=False
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml
@@ -0,0 +1,169 @@
+data:
+ tokenizer: null
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 4096
+ max_response_length: 1024
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 128
+ val_batch_size: 64
+ return_raw_input_ids: false
+ return_raw_chat: false
+ shuffle_train_dataloader: true
+actor_rollout_ref:
+ hybrid_engine: true
+ model:
+ path: Qwen/Qwen3-4B-Instruct-2507
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ use_remove_padding: false
+ actor:
+ strategy: fsdp
+ ppo_mini_batch_size: 64
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ grad_clip: 1.0
+ state_masking: false
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: false
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ ulysses_sequence_parallel_size: 1
+ optim:
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ grad_offload: false
+ optimizer_offload: true
+ fsdp_size: -1
+ ppo_micro_batch_size_per_gpu: 16
+ ref:
+ fsdp_config:
+ param_offload: true
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1
+ top_p: 0.95
+ prompt_length: ${data.max_prompt_length}
+ response_length: ${data.max_response_length}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ free_cache_engine: true
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 64
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ do_sample: true
+ 'n': 1
+ n_agent: 1
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ min_lr_ratio: null
+ warmup_style: constant
+ total_training_steps: -1
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: false
+ use_remove_padding: false
+ fsdp_config:
+ param_offload: false
+ grad_offload: false
+ optimizer_offload: false
+ wrap_policy:
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+reward_model:
+ enable: false
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: false
+ fsdp_config:
+ min_num_params: 0
+ param_offload: false
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+retriever:
+ url: http://127.0.0.1:8000/retrieve
+ topk: 3
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ no_think_rl: false
+ kl_penalty: kl
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+ start_state_marker: <information>
+ end_state_marker: </information>
+trainer:
+ total_epochs: 15
+ total_training_steps: 1005
+ project_name: ''
+ experiment_name: llm_guard_3B_10k_v2
+ logger:
+ - wandb
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 100
+ test_freq: 50
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2
+max_turns: 1
+do_search: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dfd02ebf37d6209e13bbe8cbf1a9cb005bc99507
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml
@@ -0,0 +1,189 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+ - data.train_batch_size=128
+ - data.val_batch_size=64
+ - data.max_prompt_length=4096
+ - data.max_response_length=1024
+ - data.shuffle_train_dataloader=True
+ - algorithm.adv_estimator=grpo
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.model.enable_gradient_checkpointing=true
+ - actor_rollout_ref.model.use_remove_padding=False
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.actor.ppo_mini_batch_size=64
+ - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.fsdp_config.param_offload=true
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.ref.log_prob_micro_batch_size=64
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - trainer.logger=[wandb]
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=100
+ - trainer.test_freq=50
+ - trainer.project_name=
+ - trainer.experiment_name=llm_guard_3B_10k_v2
+ - trainer.total_epochs=15
+ - trainer.total_training_steps=1005
+ - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+ - do_search=false
+ - max_turns=1
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57
+ choices:
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml
@@ -0,0 +1,35 @@
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet
+- data.train_batch_size=128
+- data.val_batch_size=64
+- data.max_prompt_length=4096
+- data.max_response_length=1024
+- data.shuffle_train_dataloader=True
+- algorithm.adv_estimator=grpo
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.model.enable_gradient_checkpointing=true
+- actor_rollout_ref.model.use_remove_padding=False
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.actor.ppo_mini_batch_size=64
+- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.fsdp_config.param_offload=true
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b68cb003ac3f943d45eb8d5cf48a7ebee5cd1f6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
@@ -0,0 +1,469 @@
+import torch
+import re
+from collections import defaultdict
+import os
+from typing import List, Dict, Any, Tuple
+from dataclasses import dataclass
+from .tensor_helper import TensorHelper, TensorConfig
+from verl import DataProto
+from verl.utils.tracking import Tracking
+import shutil
+import requests
+
+@dataclass
+class GenerationConfig:
+ max_turns: int
+ max_start_length: int
+ max_prompt_length: int
+ max_response_length: int
+ max_obs_length: int
+ num_gpus: int
+ no_think_rl: bool=False
+ search_url: str = None
+ topk: int = 3
+
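+# Example instantiation (a sketch; values mirror the Hydra run config recorded
+# in this diff: max_turns=1, two GPUs, retriever at http://127.0.0.1:8000/retrieve):
+#
+# gen_config = GenerationConfig(max_turns=1, max_start_length=256,
+#     max_prompt_length=4096, max_response_length=1024, max_obs_length=512,
+#     num_gpus=2, search_url="http://127.0.0.1:8000/retrieve", topk=3)
+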
+class LLMGenerationManager:
+ def __init__(
+ self,
+ tokenizer,
+ actor_rollout_wg,
+ config: GenerationConfig,
+ is_validation: bool = False,
+ ):
+ self.tokenizer = tokenizer
+ self.actor_rollout_wg = actor_rollout_wg
+ self.config = config
+ self.is_validation = is_validation
+
+ self.tensor_fn = TensorHelper(TensorConfig(
+ pad_token_id=tokenizer.pad_token_id,
+ max_prompt_length=config.max_prompt_length,
+ max_obs_length=config.max_obs_length,
+ max_start_length=config.max_start_length
+ ))
+
+ def _batch_tokenize(self, responses: List[str]) -> torch.Tensor:
+ """Tokenize a batch of responses."""
+ return self.tokenizer(
+ responses,
+ add_special_tokens=False,
+ return_tensors='pt',
+ padding="longest"
+ )['input_ids']
+
+ def _postprocess_responses(self, responses: torch.Tensor) -> torch.Tensor:
+ """Process responses to stop at search operation or answer operation."""
+ responses_str = self.tokenizer.batch_decode(
+ responses,
+ skip_special_tokens=True
+ )
+
+ responses_str = [resp.split('</search>')[0] + '</search>'
+ if '</search>' in resp
+ else resp.split('</answer>')[0] + '</answer>'
+ if '</answer>' in resp
+ else resp
+ for resp in responses_str]
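+ # e.g. "I need more context. <search>verl PPO config</search> Based on..."
+ # truncates to "I need more context. <search>verl PPO config</search>"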
+
+ if self.config.no_think_rl:
+ raise ValueError('stop')
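+ # NOTE: with the unconditional raise above, the branch below is unreachable;
+ # it is kept here as a reference for the intended no_think_rl behavior.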
+ # if no_think_rl is enabled, only keep action in the str
+ actions, _ = self.env.postprocess_predictions(responses_str)
+ responses_str=[f"{envs[idx].ACTION_LOOKUP[action]}" for idx, action in enumerate(actions)]
+ print("RESPONSES:", responses_str)
+ responses = self._batch_tokenize(responses_str)
+ return responses, responses_str
+
+ def _process_next_obs(self, next_obs: List[str]) -> torch.Tensor:
+ """Process next observations from environment."""
+
+ next_obs_ids = self.tokenizer(
+ next_obs,
+ padding='longest',
+ return_tensors='pt',
+ add_special_tokens=False, # Prevents adding special tokens
+ )['input_ids']
+
+ if next_obs_ids.shape[1] > self.config.max_obs_length:
+ print(f"[WARNING] OBSERVATION TOO LONG, CONSIDER CHANGING YOUR CONFIG, {next_obs_ids.shape[1]} & {self.config.max_obs_length}")
+ next_obs_ids = next_obs_ids[:, :self.config.max_obs_length]
+
+ return next_obs_ids
+
+ def _update_rolling_state(self, rollings: DataProto, cur_responses: torch.Tensor,
+ next_obs_ids: torch.Tensor) -> Dict:
+ """Update rolling state with new responses and observations."""
+ # Concatenate and handle padding
+ new_input_ids = self.tensor_fn.concatenate_with_padding([
+ rollings.batch['input_ids'],
+ cur_responses,
+ next_obs_ids
+ ])
+
+ # Create attention mask and position ids
+ new_attention_mask = self.tensor_fn.create_attention_mask(new_input_ids)
+ new_position_ids = self.tensor_fn.create_position_ids(new_attention_mask)
+
+ # Cut to appropriate length
+ effective_len = new_attention_mask.sum(dim=1).max()
+ max_len = min(self.config.max_prompt_length, effective_len)
+
+ new_rollings = DataProto.from_dict({
+ 'input_ids': new_input_ids[:, -max_len:],
+ 'position_ids': new_position_ids[:, -max_len:],
+ 'attention_mask': new_attention_mask[:, -max_len:]
+ })
+ new_rollings.meta_info.update(rollings.meta_info)
+
+ return new_rollings
+
+ def _info_masked_concatenate_with_padding(self,
+ prompt: torch.Tensor,
+ prompt_with_mask: torch.Tensor,
+ response: torch.Tensor,
+ info: torch.Tensor = None,
+ pad_to_left: bool = True
+ ) -> torch.Tensor:
+ """Concatenate tensors and handle padding. Additionally, create a mask (info_mask) to cover the information block if it exists."""
+ pad_id = self.tokenizer.pad_token_id
+ tensors = [prompt, response]
+ tensors_with_mask = [prompt_with_mask, response]
+ if info is not None:
+ tensors.append(info)
+ info_mask = torch.full(info.size(), pad_id, dtype=info.dtype, device=info.device) # information mask
+ tensors_with_mask.append(info_mask)
+
+ concatenated = torch.cat(tensors, dim=1)
+ concatenated_with_info = torch.cat(tensors_with_mask, dim=1)
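+ # A stable argsort over the pad mask repacks each row: pad tokens are moved
+ # to one contiguous side (the left when pad_to_left=True) while the relative
+ # order of the real tokens is preserved in both tensors.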
+ mask = concatenated != pad_id if pad_to_left else concatenated == pad_id
+ sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True)
+ padded_tensor = concatenated.gather(1, sorted_indices)
+ padded_tensor_with_info = concatenated_with_info.gather(1, sorted_indices)
+
+ return padded_tensor, padded_tensor_with_info
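+
+    # Sketch of the info-mask trick (illustrative ids): if the response ids are
+    # [5, 6] and the retrieved-info ids are [7, 8], the concatenated row ends
+    # [..., 5, 6, 7, 8] while the masked copy ends [..., 5, 6, pad, pad]; the
+    # pad positions mark retrieved tokens, which info_mask zeroes downstream.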
+
+ def _update_right_side(self, right_side: Dict,
+ cur_responses: torch.Tensor,
+ next_obs_ids: torch.Tensor = None) -> Dict:
+ """Update right side state."""
+        if next_obs_ids is not None:
+ responses, responses_with_info_mask = self._info_masked_concatenate_with_padding(
+ right_side['responses'],
+ right_side['responses_with_info_mask'],
+ cur_responses,
+ next_obs_ids,
+ pad_to_left=False
+ )
+ else:
+ responses, responses_with_info_mask = self._info_masked_concatenate_with_padding(
+ right_side['responses'],
+ right_side['responses_with_info_mask'],
+ cur_responses,
+ pad_to_left=False
+ )
+ effective_len = self.tensor_fn.create_attention_mask(responses).sum(dim=1).max()
+ max_len = min(self.config.max_prompt_length, effective_len)
+
+ return {'responses': responses[:, :max_len], 'responses_with_info_mask': responses_with_info_mask[:, :max_len]}
+
+ def _generate_with_gpu_padding(self, active_batch: DataProto) -> DataProto:
+ """
+ Wrapper for generation that handles multi-GPU padding requirements.
+ if num_gpus <= 1, return self.actor_rollout_wg.generate_sequences(active_batch)
+ if active_batch size is not divisible by num_gpus, pad with first sequence
+ then remove padding from output
+ """
+ num_gpus = self.config.num_gpus
+ if num_gpus <= 1:
+ return self.actor_rollout_wg.generate_sequences(active_batch)
+
+ batch_size = active_batch.batch['input_ids'].shape[0]
+ remainder = batch_size % num_gpus
+
+ for key in active_batch.batch.keys():
+ active_batch.batch[key] = active_batch.batch[key].long()
+ if remainder == 0:
+ return self.actor_rollout_wg.generate_sequences(active_batch)
+
+ # Add padding sequences
+ padding_size = num_gpus - remainder
+ padded_batch = {}
+
+ for k, v in active_batch.batch.items():
+ # Use first sequence as padding template
+ pad_sequence = v[0:1].repeat(padding_size, *[1] * (len(v.shape) - 1))
+ padded_batch[k] = torch.cat([v, pad_sequence], dim=0)
+
+ padded_active_batch = DataProto.from_dict(padded_batch)
+ for key in padded_active_batch.batch.keys():
+ padded_active_batch.batch[key] = padded_active_batch.batch[key].long()
+
+ # Generate with padded batch
+ padded_output = self.actor_rollout_wg.generate_sequences(padded_active_batch)
+
+ # Remove padding from output
+ trimmed_batch = {k: v[:-padding_size] for k, v in padded_output.batch.items()}
+
+ # Handle meta_info if present
+ if hasattr(padded_output, 'meta_info') and padded_output.meta_info:
+ trimmed_meta = {}
+ for k, v in padded_output.meta_info.items():
+ if isinstance(v, torch.Tensor):
+ trimmed_meta[k] = v[:-padding_size]
+ else:
+ trimmed_meta[k] = v
+ padded_output.meta_info = trimmed_meta
+
+ padded_output.batch = trimmed_batch
+ return padded_output
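+
+    # Worked example: with num_gpus = 4 and an active batch of 10 sequences,
+    # remainder = 10 % 4 = 2, so padding_size = 2 copies of the first row are
+    # appended, generation runs on 12 rows, and the last 2 rows are trimmed
+    # from every output tensor before returning.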
+
+    def run_llm_loop(self, gen_batch, initial_input_ids: torch.Tensor) -> DataProto:
+ """Run main LLM generation loop."""
+
+ original_left_side = {'input_ids': initial_input_ids[:, -self.config.max_start_length:]}
+ original_right_side = {'responses': initial_input_ids[:, []], 'responses_with_info_mask': initial_input_ids[:, []]}
+
+ active_mask = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.bool)
+ turns_stats = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ valid_action_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ valid_search_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ active_num_list = [active_mask.sum().item()]
+ rollings = gen_batch
+
+ # Main generation loop
+ for step in range(self.config.max_turns):
+ if not active_mask.sum():
+ break
+ rollings.batch = self.tensor_fn.cut_to_effective_len(
+ rollings.batch,
+ keys=['input_ids', 'attention_mask', 'position_ids']
+ )
+
+ # gen_output = self.actor_rollout_wg.generate_sequences(rollings)
+ rollings_active = DataProto.from_dict({
+ k: v[active_mask] for k, v in rollings.batch.items()
+ })
+ gen_output = self._generate_with_gpu_padding(rollings_active)
+
+ meta_info = gen_output.meta_info
+ responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses'])
+ responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask)
+
+ # Execute in environment and process observations
+ next_obs, dones, valid_action, is_search = self.execute_predictions(
+ responses_str, self.tokenizer.pad_token, active_mask
+ )
+
+ curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool)
+ active_mask = active_mask * curr_active_mask
+ active_num_list.append(active_mask.sum().item())
+ turns_stats[curr_active_mask] += 1
+ valid_action_stats += torch.tensor(valid_action, dtype=torch.int)
+ valid_search_stats += torch.tensor(is_search, dtype=torch.int)
+
+ next_obs_ids = self._process_next_obs(next_obs)
+
+ # Update states
+ rollings = self._update_rolling_state(
+ rollings,
+ responses_ids,
+ next_obs_ids
+ )
+ original_right_side = self._update_right_side(
+ original_right_side,
+ responses_ids,
+ next_obs_ids
+ )
+
+ # final LLM rollout
+ if active_mask.sum():
+ rollings.batch = self.tensor_fn.cut_to_effective_len(
+ rollings.batch,
+ keys=['input_ids', 'attention_mask', 'position_ids']
+ )
+
+ # gen_output = self.actor_rollout_wg.generate_sequences(rollings)
+ rollings_active = DataProto.from_dict({
+ k: v[active_mask] for k, v in rollings.batch.items()
+ })
+ gen_output = self._generate_with_gpu_padding(rollings_active)
+
+ meta_info = gen_output.meta_info
+ responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses'])
+ responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask)
+
+ # # Execute in environment and process observations
+ _, dones, valid_action, is_search = self.execute_predictions(
+ responses_str, self.tokenizer.pad_token, active_mask, do_search=False
+ )
+
+ curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool)
+ active_mask = active_mask * curr_active_mask
+ active_num_list.append(active_mask.sum().item())
+ valid_action_stats += torch.tensor(valid_action, dtype=torch.int)
+ valid_search_stats += torch.tensor(is_search, dtype=torch.int)
+
+
+ original_right_side = self._update_right_side(
+ original_right_side,
+ responses_ids,
+ )
+
+ meta_info['turns_stats'] = turns_stats.tolist()
+ meta_info['active_mask'] = active_mask.tolist()
+ meta_info['valid_action_stats'] = valid_action_stats.tolist()
+ meta_info['valid_search_stats'] = valid_search_stats.tolist()
+
+ print("ACTIVE_TRAJ_NUM:", active_num_list)
+
+ return self._compose_final_output(original_left_side, original_right_side, meta_info)
+
+    def _compose_final_output(self, left_side: Dict,
+                            right_side: Dict,
+                            meta_info: Dict) -> DataProto:
+ """Compose final generation output."""
+ final_output = right_side.copy()
+ final_output['prompts'] = left_side['input_ids']
+
+ # Combine input IDs
+ final_output['input_ids'] = torch.cat([
+ left_side['input_ids'],
+ right_side['responses']
+ ], dim=1)
+
+ # Create attention mask and position ids
+ final_output['attention_mask'] = torch.cat([
+ self.tensor_fn.create_attention_mask(left_side['input_ids']),
+ self.tensor_fn.create_attention_mask(final_output['responses'])
+ ], dim=1)
+ final_output['info_mask'] = torch.cat([
+ self.tensor_fn.create_attention_mask(left_side['input_ids']),
+ self.tensor_fn.create_attention_mask(final_output['responses_with_info_mask'])
+ ], dim=1)
+
+ final_output['position_ids'] = self.tensor_fn.create_position_ids(
+ final_output['attention_mask']
+ )
+
+ final_output = DataProto.from_dict(final_output)
+ final_output.meta_info.update(meta_info)
+
+ return final_output
+
+    def execute_predictions(self, predictions: List[str], pad_token: str, active_mask=None, do_search=True) -> Tuple[List[str], List[int], List[int], List[int]]:
+        """
+        Execute predictions across multiple environments.
+        NOTE: this function plays the role of the environment's `step` function.
+        NOTE: penalty_for_invalid is not included in the observation shown to the LLM.
+
+        Args:
+            predictions: List of action predictions
+            pad_token: Token to use for padding
+            active_mask: Boolean mask of trajectories that are still active
+            do_search: Whether to actually query the search engine
+
+        Returns:
+            Tuple of (next observations, done flags, valid-action flags, is-search flags)
+        """
+ cur_actions, contents = self.postprocess_predictions(predictions)
+ next_obs, dones, valid_action, is_search = [], [], [], []
+
+ search_queries = [content for action, content in zip(cur_actions, contents) if action == 'search']
+ if do_search:
+ search_results = self.batch_search(search_queries)
+ assert len(search_results) == sum([1 for action in cur_actions if action == 'search'])
+ else:
+ search_results = [''] * sum([1 for action in cur_actions if action == 'search'])
+
+ for i, (action, active) in enumerate(zip(cur_actions, active_mask)):
+
+ if not active:
+ next_obs.append('')
+ dones.append(1)
+ valid_action.append(0)
+ is_search.append(0)
+ else:
+ if action == 'answer':
+ next_obs.append('')
+ dones.append(1)
+ valid_action.append(1)
+ is_search.append(0)
+ elif action == 'search':
+                    next_obs.append(f'\n\n<information>{search_results.pop(0).strip()}</information>\n\n')
+ dones.append(0)
+ valid_action.append(1)
+ is_search.append(1)
+ else:
+                    next_obs.append(f'\nMy previous action is invalid. \
+If I want to search, I should put the query between <search> and </search>. \
+If I want to give the final answer, I should put the answer between <answer> and </answer>. Let me try again.\n')
+ dones.append(0)
+ valid_action.append(0)
+ is_search.append(0)
+
+ assert len(search_results) == 0
+
+ return next_obs, dones, valid_action, is_search
+
+    def postprocess_predictions(self, predictions: List[Any]) -> Tuple[List[str], List[str]]:
+        """
+        Process (text-based) predictions from the LLM into actions and contents.
+
+        Args:
+            predictions: List of raw predictions
+
+        Returns:
+            Tuple of (actions list, contents list); the action is None when no
+            valid <search> or <answer> block is found.
+        """
+ actions = []
+ contents = []
+
+ for prediction in predictions:
+ if isinstance(prediction, str): # for llm output
+                pattern = r'<(search|answer)>(.*?)</\1>'
+ match = re.search(pattern, prediction, re.DOTALL)
+ if match:
+ content = match.group(2).strip() # Return only the content inside the tags
+ action = match.group(1)
+ else:
+ content = ''
+ action = None
+ else:
+ raise ValueError(f"Invalid prediction type: {type(prediction)}")
+
+ actions.append(action)
+ contents.append(content)
+
+ return actions, contents
+
+    def batch_search(self, queries: List[str] = None) -> List[str]:
+        """
+        Batchified search for queries.
+        Args:
+            queries: queries to send to the search engine
+        Returns:
+            one formatted result string per query, concatenating its retrieved passages
+        """
+ results = self._batch_search(queries)['result']
+
+ return [self._passages2string(result) for result in results]
+
+ def _batch_search(self, queries):
+
+ payload = {
+ "queries": queries,
+ "topk": self.config.topk,
+ "return_scores": True
+ }
+
+ return requests.post(self.config.search_url, json=payload).json()
+
+ def _passages2string(self, retrieval_result):
+ format_reference = ''
+ for idx, doc_item in enumerate(retrieval_result):
+
+ content = doc_item['document']['contents']
+ title = content.split("\n")[0]
+ text = "\n".join(content.split("\n")[1:])
+ format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
+
+ return format_reference
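+
+# Illustration of the output format (passages are made up): two retrieved docs
+# whose `contents` begin with a title line are rendered as
+#   Doc 1(Title: FAISS) FAISS is a library for efficient similarity search...
+#   Doc 2(Title: BM25) BM25 is a bag-of-words ranking function...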
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a7c7c084c4f952533f43b214f987db81075255
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py
@@ -0,0 +1,75 @@
+import torch
+from typing import Dict, Tuple, List
+from dataclasses import dataclass
+
+@dataclass
+class TensorConfig:
+ pad_token_id: int
+ max_prompt_length: int
+ max_obs_length: int
+ max_start_length: int
+
+class TensorHelper:
+ def __init__(self, config: TensorConfig):
+ self.config = config
+
+ def cut_to_effective_len(self, tensor_dict: Dict[str, torch.Tensor],
+ keys: List[str], cut_left: bool = True) -> Dict[str, torch.Tensor]:
+ """Cut tensors to their effective length based on attention mask."""
+ effective_len = tensor_dict['attention_mask'].sum(dim=1).max()
+ result = tensor_dict.copy()
+
+ for key in keys:
+ if cut_left:
+ result[key] = tensor_dict[key][:, -effective_len:]
+ else:
+ result[key] = tensor_dict[key][:, :effective_len]
+ return result
+
+ def convert_pad_structure(self, tensor: torch.Tensor, pad_to_left: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Convert padding structure and return sorted tensor with indices."""
+ mask = tensor != self.config.pad_token_id if pad_to_left else tensor == self.config.pad_token_id
+ sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True)
+ return tensor.gather(1, sorted_indices), sorted_indices
+
+ def create_attention_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
+ """Create attention mask from input ids."""
+ return torch.where(input_ids != self.config.pad_token_id, 1, 0)
+
+ def create_position_ids(self, attention_mask: torch.Tensor) -> torch.Tensor:
+ """Create position ids from attention mask."""
+ return (torch.cumsum(attention_mask, dim=1) - 1) * attention_mask
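+
+    # Worked example: a left-padded row with attention_mask [0, 0, 1, 1, 1]
+    # has cumsum - 1 = [-1, -1, 0, 1, 2]; multiplying by the mask zeroes the
+    # pad slots, giving position_ids [0, 0, 0, 1, 2].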
+
+ def concatenate_with_padding(self, tensors: List[torch.Tensor],
+ pad_to_left: bool = True) -> torch.Tensor:
+ """Concatenate tensors and handle padding."""
+ concatenated = torch.cat(tensors, dim=1)
+ padded_tensor, _ = self.convert_pad_structure(concatenated, pad_to_left)
+ return padded_tensor
+
+ def _example_level_pad(self, responses: torch.Tensor,
+ responses_str: List[str],
+ active_mask: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
+ """
+ Pad responses for non-active examples with pad tokens.
+ """
+ assert active_mask.sum() == responses.shape[0]
+ # Create masked responses tensor
+ batch_size = active_mask.shape[0]
+ seq_len = responses.shape[1]
+ padded_responses = torch.full(
+ (batch_size, seq_len), self.config.pad_token_id,
+ dtype=responses.dtype, device=responses.device
+ )
+ padded_responses[active_mask] = responses
+
+ # Create masked response strings
+ padded_responses_str = [""] * batch_size
+
+ s = 0
+ for i, is_active in enumerate(active_mask):
+ if is_active:
+ padded_responses_str[i] = responses_str[s]
+ s += 1
+
+ return padded_responses, padded_responses_str
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh
new file mode 100644
index 0000000000000000000000000000000000000000..05556a3939471d956360bc1f91d7043e19c73a85
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh
@@ -0,0 +1,19 @@
+
+corpus_file=/your/corpus/jsonl/file # jsonl
+save_dir=/the/path/to/save/index
+retriever_name=e5 # this is for indexing naming
+retriever_model=intfloat/e5-base-v2
+
+# change faiss_type to HNSW32/64/128 for ANN indexing
+# change retriever_name to bm25 for BM25 indexing
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python index_builder.py \
+ --retrieval_method $retriever_name \
+ --model_path $retriever_model \
+ --corpus_path $corpus_file \
+ --save_dir $save_dir \
+ --use_fp16 \
+ --max_length 256 \
+ --batch_size 512 \
+ --pooling_method mean \
+ --faiss_type Flat \
+ --save_embedding
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad72aeefae69d0796f137557ad8f3bb0d2381be6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py
@@ -0,0 +1,202 @@
+import os
+import re
+import requests
+import argparse
+import asyncio
+import random
+from typing import List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+
+import chardet
+import aiohttp
+import bs4
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+from googleapiclient.discovery import build
+
+
+# --- CLI Args ---
+parser = argparse.ArgumentParser(description="Launch online search server.")
+parser.add_argument('--api_key', type=str, required=True, help="API key for Google search")
+parser.add_argument('--cse_id', type=str, required=True, help="CSE ID for Google search")
+parser.add_argument('--topk', type=int, default=3, help="Number of results to return per query")
+parser.add_argument('--snippet_only', action='store_true', help="If set, only return snippets; otherwise, return full context.")
+args = parser.parse_args()
+
+
+# --- Config ---
+class OnlineSearchConfig:
+ def __init__(self, topk: int = 3, api_key: Optional[str] = None, cse_id: Optional[str] = None, snippet_only: bool = False):
+ self.topk = topk
+ self.api_key = api_key
+ self.cse_id = cse_id
+ self.snippet_only = snippet_only
+
+
+# --- Utilities ---
+def parse_snippet(snippet: str) -> List[str]:
+ segments = snippet.split("...")
+ return [s.strip() for s in segments if len(s.strip().split()) > 5]
+
+
+def sanitize_search_query(query: str) -> str:
+ # Remove or replace special characters that might cause issues.
+ # This is a basic example; you might need to add more characters or patterns.
+    sanitized_query = re.sub(r'[^\w\s]', ' ', query)  # replace non-alphanumeric, non-whitespace characters with spaces
+    sanitized_query = re.sub(r'[\t\r\f\v\n]', ' ', sanitized_query)  # replace tab, carriage return, form feed, vertical tab, newline with spaces
+    sanitized_query = re.sub(r'\s+', ' ', sanitized_query).strip()  # collapse repeated spaces and trim leading/trailing spaces
+
+ return sanitized_query
+
+
+def filter_links(search_results: List[Dict]) -> List[str]:
+ links = []
+ for result in search_results:
+ for item in result.get("items", []):
+ if "mime" in item:
+ continue
+ ext = os.path.splitext(item["link"])[1]
+ if ext in ["", ".html", ".htm", ".shtml"]:
+ links.append(item["link"])
+ return links
+
+
+async def fetch(session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore) -> str:
+ user_agents = [
+ "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P)...",
+ "Mozilla/5.0 AppleWebKit/537.36...",
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +https://www.google.com/bot.html)",
+ ]
+ headers = {"User-Agent": random.choice(user_agents)}
+
+ async with semaphore:
+ try:
+ async with session.get(url, headers=headers) as response:
+ raw = await response.read()
+ detected = chardet.detect(raw)
+ encoding = detected["encoding"] or "utf-8"
+ return raw.decode(encoding, errors="ignore")
+ except (aiohttp.ClientError, asyncio.TimeoutError):
+ return ""
+
+
+async def fetch_all(urls: List[str], limit: int = 8) -> List[str]:
+ semaphore = asyncio.Semaphore(limit)
+ timeout = aiohttp.ClientTimeout(total=5)
+ connector = aiohttp.TCPConnector(limit_per_host=limit, force_close=True)
+
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
+ tasks = [fetch(session, url, semaphore) for url in urls]
+ return await asyncio.gather(*tasks)
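+
+# Usage sketch (URLs are placeholders): running
+#   pages = asyncio.run(fetch_all(["https://example.com/a", "https://example.com/b"]))
+# returns the decoded HTML strings in input order, with "" for any fetch that
+# errored or timed out.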
+
+
+# --- Search Engine ---
+class OnlineSearchEngine:
+ def __init__(self, config: OnlineSearchConfig):
+ self.config = config
+
+ def collect_context(self, snippet: str, doc: str) -> str:
+ snippets = parse_snippet(snippet)
+ ctx_paras = []
+
+ for s in snippets:
+ pos = doc.replace("\n", " ").find(s)
+ if pos == -1:
+ continue
+ sta = pos
+ while sta > 0 and doc[sta] != "\n":
+ sta -= 1
+ end = pos + len(s)
+ while end < len(doc) and doc[end] != "\n":
+ end += 1
+ para = doc[sta:end].strip()
+ if para not in ctx_paras:
+ ctx_paras.append(para)
+
+ return "\n".join(ctx_paras)
+
+ def fetch_web_content(self, search_results: List[Dict]) -> Dict[str, str]:
+ links = filter_links(search_results)
+ contents = asyncio.run(fetch_all(links))
+ content_dict = {}
+ for html, link in zip(contents, links):
+ soup = bs4.BeautifulSoup(html, "html.parser")
+ text = "\n".join([p.get_text() for p in soup.find_all("p")])
+ content_dict[link] = text
+ return content_dict
+
+ def search(self, search_term: str, num_iter: int = 1) -> List[Dict]:
+ service = build('customsearch', 'v1', developerKey=self.config.api_key)
+ results = []
+ sanitize_search_term = sanitize_search_query(search_term)
+        if not sanitize_search_term or sanitize_search_term.isspace():
+ return results
+ res = service.cse().list(q=sanitize_search_term, cx=self.config.cse_id).execute()
+ results.append(res)
+
+ for _ in range(num_iter - 1):
+ if 'nextPage' not in res.get('queries', {}):
+ break
+ start_idx = res['queries']['nextPage'][0]['startIndex']
+            res = service.cse().list(q=sanitize_search_term, cx=self.config.cse_id, start=start_idx).execute()
+ results.append(res)
+
+ return results
+
+    def batch_search(self, queries: List[str]) -> List[List[Dict]]:
+ with ThreadPoolExecutor() as executor:
+ return list(executor.map(self._retrieve_context, queries))
+
+    def _retrieve_context(self, query: str) -> List[Dict]:
+        search_results = self.search(query)
+
+        if self.config.snippet_only:
+            contexts = []
+ for result in search_results:
+ for item in result.get("items", []):
+ title = item.get("title", "")
+ context = ' '.join(parse_snippet(item.get("snippet", "")))
+ if title != "" or context != "":
+ title = "No title." if not title else title
+ context = "No snippet available." if not context else context
+ contexts.append({
+ 'document': {"contents": f'\"{title}\"\n{context}'},
+ })
+ else:
+ content_dict = self.fetch_web_content(search_results)
+ contexts = []
+ for result in search_results:
+ for item in result.get("items", []):
+ link = item["link"]
+ title = item.get("title", "")
+ snippet = item.get("snippet", "")
+ if link in content_dict:
+ context = self.collect_context(snippet, content_dict[link])
+ if title != "" or context != "":
+ title = "No title." if not title else title
+ context = "No snippet available." if not context else context
+ contexts.append({
+ 'document': {"contents": f'\"{title}\"\n{context}'},
+ })
+
+ return contexts[:self.config.topk]
+
+
+# --- FastAPI App ---
+app = FastAPI(title="Online Search Proxy Server")
+
+class SearchRequest(BaseModel):
+ queries: List[str]
+
+config = OnlineSearchConfig(api_key=args.api_key, cse_id=args.cse_id, topk=args.topk, snippet_only=args.snippet_only)
+engine = OnlineSearchEngine(config)
+
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ results = engine.batch_search(request.queries)
+ return {"result": results}
+
+
+if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cba65a65e3656fd6787b5a1fe024c33c630fcaf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py
@@ -0,0 +1,349 @@
+import os
+import faiss
+import json
+import warnings
+import numpy as np
+from typing import cast, List, Dict
+import shutil
+import subprocess
+import argparse
+import torch
+from tqdm import tqdm
+# from LongRAG.retriever.utils import load_model, load_corpus, pooling
+import datasets
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+
+
+def load_model(
+ model_path: str,
+ use_fp16: bool = False
+ ):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+
+ return model, tokenizer
+
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+ ):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
+
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4)
+ return corpus
+
+
+class Index_Builder:
+ r"""A tool class used to build an index used in retrieval.
+
+ """
+ def __init__(
+ self,
+ retrieval_method,
+ model_path,
+ corpus_path,
+ save_dir,
+ max_length,
+ batch_size,
+ use_fp16,
+ pooling_method,
+ faiss_type=None,
+ embedding_path=None,
+ save_embedding=False,
+ faiss_gpu=False
+ ):
+
+ self.retrieval_method = retrieval_method.lower()
+ self.model_path = model_path
+ self.corpus_path = corpus_path
+ self.save_dir = save_dir
+ self.max_length = max_length
+ self.batch_size = batch_size
+ self.use_fp16 = use_fp16
+ self.pooling_method = pooling_method
+ self.faiss_type = faiss_type if faiss_type is not None else 'Flat'
+ self.embedding_path = embedding_path
+ self.save_embedding = save_embedding
+ self.faiss_gpu = faiss_gpu
+
+ self.gpu_num = torch.cuda.device_count()
+ # prepare save dir
+ print(self.save_dir)
+ if not os.path.exists(self.save_dir):
+ os.makedirs(self.save_dir)
+ else:
+ if not self._check_dir(self.save_dir):
+ warnings.warn("Some files already exists in save dir and may be overwritten.", UserWarning)
+
+ self.index_save_path = os.path.join(self.save_dir, f"{self.retrieval_method}_{self.faiss_type}.index")
+
+ self.embedding_save_path = os.path.join(self.save_dir, f"emb_{self.retrieval_method}.memmap")
+
+ self.corpus = load_corpus(self.corpus_path)
+
+ print("Finish loading...")
+ @staticmethod
+ def _check_dir(dir_path):
+ r"""Check if the dir path exists and if there is content.
+
+ """
+
+ if os.path.isdir(dir_path):
+ if len(os.listdir(dir_path)) > 0:
+ return False
+ else:
+ os.makedirs(dir_path, exist_ok=True)
+ return True
+
+ def build_index(self):
+ r"""Constructing different indexes based on selective retrieval method.
+
+ """
+ if self.retrieval_method == "bm25":
+ self.build_bm25_index()
+ else:
+ self.build_dense_index()
+
+ def build_bm25_index(self):
+ """Building BM25 index based on Pyserini library.
+
+ Reference: https://github.com/castorini/pyserini/blob/master/docs/usage-index.md#building-a-bm25-index-direct-java-implementation
+ """
+
+ # to use pyserini pipeline, we first need to place jsonl file in the folder
+ self.save_dir = os.path.join(self.save_dir, "bm25")
+ os.makedirs(self.save_dir, exist_ok=True)
+ temp_dir = self.save_dir + "/temp"
+ temp_file_path = temp_dir + "/temp.jsonl"
+ os.makedirs(temp_dir)
+
+ # if self.have_contents:
+ # shutil.copyfile(self.corpus_path, temp_file_path)
+ # else:
+ # with open(temp_file_path, "w") as f:
+ # for item in self.corpus:
+ # f.write(json.dumps(item) + "\n")
+ shutil.copyfile(self.corpus_path, temp_file_path)
+
+ print("Start building bm25 index...")
+ pyserini_args = ["--collection", "JsonCollection",
+ "--input", temp_dir,
+ "--index", self.save_dir,
+ "--generator", "DefaultLuceneDocumentGenerator",
+ "--threads", "1"]
+
+ subprocess.run(["python", "-m", "pyserini.index.lucene"] + pyserini_args)
+
+ shutil.rmtree(temp_dir)
+
+ print("Finish!")
+
+ def _load_embedding(self, embedding_path, corpus_size, hidden_size):
+ all_embeddings = np.memmap(
+ embedding_path,
+ mode="r",
+ dtype=np.float32
+ ).reshape(corpus_size, hidden_size)
+ return all_embeddings
+
+ def _save_embedding(self, all_embeddings):
+ memmap = np.memmap(
+ self.embedding_save_path,
+ shape=all_embeddings.shape,
+ mode="w+",
+ dtype=all_embeddings.dtype
+ )
+ length = all_embeddings.shape[0]
+ # add in batch
+ save_batch_size = 10000
+ if length > save_batch_size:
+ for i in tqdm(range(0, length, save_batch_size), leave=False, desc="Saving Embeddings"):
+ j = min(i + save_batch_size, length)
+ memmap[i: j] = all_embeddings[i: j]
+ else:
+ memmap[:] = all_embeddings
+
+ def encode_all(self):
+ if self.gpu_num > 1:
+ print("Use multi gpu!")
+ self.encoder = torch.nn.DataParallel(self.encoder)
+ self.batch_size = self.batch_size * self.gpu_num
+
+ all_embeddings = []
+
+ for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'):
+
+ # batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
+ # batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
+ # batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
+ batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents']
+
+ if self.retrieval_method == "e5":
+ batch_data = [f"passage: {doc}" for doc in batch_data]
+
+ inputs = self.tokenizer(
+ batch_data,
+ padding=True,
+ truncation=True,
+ return_tensors='pt',
+ max_length=self.max_length,
+            )
+
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ #TODO: support encoder-only T5 model
+ if "T5" in type(self.encoder).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.encoder(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ embeddings = output.last_hidden_state[:, 0, :]
+
+ else:
+ output = self.encoder(**inputs, return_dict=True)
+ embeddings = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.retrieval_method:
+ embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
+
+ embeddings = cast(torch.Tensor, embeddings)
+ embeddings = embeddings.detach().cpu().numpy()
+ all_embeddings.append(embeddings)
+
+ all_embeddings = np.concatenate(all_embeddings, axis=0)
+ all_embeddings = all_embeddings.astype(np.float32)
+
+ return all_embeddings
+
+ @torch.no_grad()
+ def build_dense_index(self):
+ """Obtain the representation of documents based on the embedding model(BERT-based) and
+ construct a faiss index.
+ """
+
+ if os.path.exists(self.index_save_path):
+ print("The index file already exists and will be overwritten.")
+
+ self.encoder, self.tokenizer = load_model(model_path = self.model_path,
+ use_fp16 = self.use_fp16)
+ if self.embedding_path is not None:
+ hidden_size = self.encoder.config.hidden_size
+ corpus_size = len(self.corpus)
+ all_embeddings = self._load_embedding(self.embedding_path, corpus_size, hidden_size)
+ else:
+ all_embeddings = self.encode_all()
+ if self.save_embedding:
+ self._save_embedding(all_embeddings)
+ del self.corpus
+
+ # build index
+ print("Creating index")
+ dim = all_embeddings.shape[-1]
+ faiss_index = faiss.index_factory(dim, self.faiss_type, faiss.METRIC_INNER_PRODUCT)
+
+ if self.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ faiss_index = faiss.index_cpu_to_all_gpus(faiss_index, co)
+ if not faiss_index.is_trained:
+ faiss_index.train(all_embeddings)
+ faiss_index.add(all_embeddings)
+ faiss_index = faiss.index_gpu_to_cpu(faiss_index)
+ else:
+ if not faiss_index.is_trained:
+ faiss_index.train(all_embeddings)
+ faiss_index.add(all_embeddings)
+
+ faiss.write_index(faiss_index, self.index_save_path)
+ print("Finish!")
+
+
+MODEL2POOLING = {
+ "e5": "mean",
+ "bge": "cls",
+ "contriever": "mean",
+ 'jina': 'mean'
+}
+
+
+def main():
+ parser = argparse.ArgumentParser(description = "Creating index.")
+
+ # Basic parameters
+ parser.add_argument('--retrieval_method', type=str)
+ parser.add_argument('--model_path', type=str, default=None)
+ parser.add_argument('--corpus_path', type=str)
+    parser.add_argument('--save_dir', default='indexes/', type=str)
+
+ # Parameters for building dense index
+ parser.add_argument('--max_length', type=int, default=180)
+ parser.add_argument('--batch_size', type=int, default=512)
+ parser.add_argument('--use_fp16', default=False, action='store_true')
+ parser.add_argument('--pooling_method', type=str, default=None)
+    parser.add_argument('--faiss_type', default=None, type=str)
+ parser.add_argument('--embedding_path', default=None, type=str)
+ parser.add_argument('--save_embedding', action='store_true', default=False)
+ parser.add_argument('--faiss_gpu', default=False, action='store_true')
+
+ args = parser.parse_args()
+
+ if args.pooling_method is None:
+ pooling_method = 'mean'
+ for k,v in MODEL2POOLING.items():
+ if k in args.retrieval_method.lower():
+ pooling_method = v
+ break
+ else:
+ if args.pooling_method not in ['mean','cls','pooler']:
+ raise NotImplementedError
+ else:
+ pooling_method = args.pooling_method
+
+
+ index_builder = Index_Builder(
+ retrieval_method = args.retrieval_method,
+ model_path = args.model_path,
+ corpus_path = args.corpus_path,
+ save_dir = args.save_dir,
+ max_length = args.max_length,
+ batch_size = args.batch_size,
+ use_fp16 = args.use_fp16,
+ pooling_method = pooling_method,
+ faiss_type = args.faiss_type,
+ embedding_path = args.embedding_path,
+ save_embedding = args.save_embedding,
+ faiss_gpu = args.faiss_gpu
+ )
+ index_builder.build_index()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..9edabe881bbc685786d6dde292ae8e72b0216aae
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py
@@ -0,0 +1,161 @@
+import argparse
+from collections import defaultdict
+from typing import Optional
+from dataclasses import dataclass, field
+
+from sentence_transformers import CrossEncoder
+import torch
+from transformers import HfArgumentParser
+import numpy as np
+
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+
+class BaseCrossEncoder:
+ def __init__(self, model, batch_size=32, device="cuda"):
+ self.model = model
+ self.batch_size = batch_size
+ self.model.to(device)
+
+ def _passage_to_string(self, doc_item):
+ if "document" not in doc_item:
+ content = doc_item['contents']
+ else:
+ content = doc_item['document']['contents']
+ title = content.split("\n")[0]
+ text = "\n".join(content.split("\n")[1:])
+
+ return f"(Title: {title}) {text}"
+
+ def rerank(self,
+ queries: list[str],
+ documents: list[list[dict]]):
+ """
+ Assume documents is a list of list of dicts, where each dict is a document with keys "id" and "contents".
+ This asumption is made to be consistent with the output of the retrieval server.
+ """
+ assert len(queries) == len(documents)
+
+ pairs = []
+ qids = []
+        for qid, query in enumerate(queries):
+            # pair each query only with its own candidate documents
+            for doc_item in documents[qid]:
+                doc = self._passage_to_string(doc_item)
+                pairs.append((query, doc))
+                qids.append(qid)
+
+ scores = self._predict(pairs)
+ query_to_doc_scores = defaultdict(list)
+
+ assert len(scores) == len(pairs) == len(qids)
+ for i in range(len(pairs)):
+ query, doc = pairs[i]
+ score = scores[i]
+ qid = qids[i]
+ query_to_doc_scores[qid].append((doc, score))
+
+ sorted_query_to_doc_scores = {}
+ for query, doc_scores in query_to_doc_scores.items():
+ sorted_query_to_doc_scores[query] = sorted(doc_scores, key=lambda x: x[1], reverse=True)
+
+ return sorted_query_to_doc_scores
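+
+    # Output sketch (scores are illustrative):
+    #   {0: [("(Title: A) ...", 9.3), ("(Title: B) ...", 4.1)], 1: [...]}
+    # i.e. one list per query id, sorted by descending cross-encoder score.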
+
+ def _predict(self, pairs: list[tuple[str, str]]):
+ raise NotImplementedError
+
+ @classmethod
+ def load(cls, model_name_or_path, **kwargs):
+ raise NotImplementedError
+
+
+class SentenceTransformerCrossEncoder(BaseCrossEncoder):
+ def __init__(self, model, batch_size=32, device="cuda"):
+ super().__init__(model, batch_size, device)
+
+ def _predict(self, pairs: list[tuple[str, str]]):
+ scores = self.model.predict(pairs, batch_size=self.batch_size)
+        scores = scores.tolist() if isinstance(scores, (torch.Tensor, np.ndarray)) else scores
+ return scores
+
+ @classmethod
+ def load(cls, model_name_or_path, **kwargs):
+ model = CrossEncoder(model_name_or_path)
+ return cls(model, **kwargs)
+
+
+class RerankRequest(BaseModel):
+ queries: list[str]
+ documents: list[list[dict]]
+ rerank_topk: Optional[int] = None
+ return_scores: bool = False
+
+
+@dataclass
+class RerankerArguments:
+ max_length: int = field(default=512)
+ rerank_topk: int = field(default=3)
+ rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2")
+ batch_size: int = field(default=32)
+ reranker_type: str = field(default="sentence_transformer")
+
+def get_reranker(config):
+ if config.reranker_type == "sentence_transformer":
+ return SentenceTransformerCrossEncoder.load(
+ config.rerank_model_name_or_path,
+ batch_size=config.batch_size,
+ device="cuda" if torch.cuda.is_available() else "cpu"
+ )
+ else:
+ raise ValueError(f"Unknown reranker type: {config.reranker_type}")
+
+
+app = FastAPI()
+
+@app.post("/rerank")
+def rerank_endpoint(request: RerankRequest):
+ """
+    Endpoint that accepts queries plus candidate documents and performs reranking.
+ Input format:
+ {
+ "queries": ["What is Python?", "Tell me about neural networks."],
+ "documents": [[doc_item_1, ..., doc_item_k], [doc_item_1, ..., doc_item_k]],
+ "rerank_topk": 3,
+ "return_scores": true
+ }
+ """
+ if not request.rerank_topk:
+ request.rerank_topk = config.rerank_topk # fallback to default
+
+    # Perform batch reranking
+ # doc_scores already sorted by score
+ query_to_doc_scores = reranker.rerank(request.queries, request.documents)
+
+ # Format response
+ resp = []
+ for _, doc_scores in query_to_doc_scores.items():
+ doc_scores = doc_scores[:request.rerank_topk]
+ if request.return_scores:
+ combined = []
+ for doc, score in doc_scores:
+ combined.append({"document": doc, "score": score})
+ resp.append(combined)
+ else:
+ resp.append([doc for doc, _ in doc_scores])
+ return {"result": resp}
+
+
+if __name__ == "__main__":
+
+ # 1) Build a config (could also parse from arguments).
+ # In real usage, you'd parse your CLI arguments or environment variables.
+    parser = HfArgumentParser(RerankerArguments)
+ config = parser.parse_args_into_dataclasses()[0]
+
+    # 2) Instantiate a global reranker so it is loaded once and reused.
+ reranker = get_reranker(config)
+
+    # 3) Launch the server. It listens on http://0.0.0.0:6980.
+ uvicorn.run(app, host="0.0.0.0", port=6980)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py
new file mode 100644
index 0000000000000000000000000000000000000000..125643a7bea6e83c612fe6ed02e25ea1a7464670
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py
@@ -0,0 +1,368 @@
+import json
+import os
+import warnings
+from typing import List, Dict
+import functools
+from tqdm import tqdm
+from multiprocessing import Pool
+import faiss
+import torch
+import numpy as np
+from transformers import AutoConfig, AutoTokenizer, AutoModel
+import argparse
+import datasets
+
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4)
+ return corpus
+
+
+def read_jsonl(file_path):
+ data = []
+
+ with open(file_path, "r") as f:
+ readin = f.readlines()
+ for line in readin:
+ data.append(json.loads(line))
+ return data
+
+
+def load_docs(corpus, doc_idxs):
+ results = [corpus[int(idx)] for idx in doc_idxs]
+
+ return results
+
+
+def load_model(
+ model_path: str,
+ use_fp16: bool = False
+ ):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+
+ return model, tokenizer
+
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+ ):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
+
+
+class Encoder:
+ def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
+ self.model_name = model_name
+ self.model_path = model_path
+ self.pooling_method = pooling_method
+ self.max_length = max_length
+ self.use_fp16 = use_fp16
+
+ self.model, self.tokenizer = load_model(model_path=model_path,
+ use_fp16=use_fp16)
+
+ @torch.no_grad()
+ def encode(self, query_list: List[str], is_query=True) -> np.ndarray:
+ # processing query for different encoders
+ if isinstance(query_list, str):
+ query_list = [query_list]
+
+ if "e5" in self.model_name.lower():
+ if is_query:
+ query_list = [f"query: {query}" for query in query_list]
+ else:
+ query_list = [f"passage: {query}" for query in query_list]
+
+ if "bge" in self.model_name.lower():
+ if is_query:
+ query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list]
+
+ inputs = self.tokenizer(query_list,
+ max_length=self.max_length,
+ padding=True,
+ truncation=True,
+ return_tensors="pt"
+ )
+ inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ if "T5" in type(self.model).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.model(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ query_emb = output.last_hidden_state[:, 0, :]
+
+ else:
+ output = self.model(**inputs, return_dict=True)
+ query_emb = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.model_name.lower():
+ query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
+
+ query_emb = query_emb.detach().cpu().numpy()
+ query_emb = query_emb.astype(np.float32, order="C")
+ return query_emb
+
+
+class BaseRetriever:
+ """Base object for all retrievers."""
+
+ def __init__(self, config):
+ self.config = config
+ self.retrieval_method = config.retrieval_method
+ self.topk = config.retrieval_topk
+
+ self.index_path = config.index_path
+ self.corpus_path = config.corpus_path
+
+ # self.cache_save_path = os.path.join(config.save_dir, 'retrieval_cache.json')
+
+    def _search(self, query: str, num: int, return_score: bool) -> List[Dict[str, str]]:
+ r"""Retrieve topk relevant documents in corpus.
+ Return:
+ list: contains information related to the document, including:
+ contents: used for building index
+ title: (if provided)
+ text: (if provided)
+ """
+ pass
+
+ def _batch_search(self, query_list, num, return_score):
+ pass
+
+ def search(self, *args, **kwargs):
+ return self._search(*args, **kwargs)
+
+ def batch_search(self, *args, **kwargs):
+ return self._batch_search(*args, **kwargs)
+
+
+class BM25Retriever(BaseRetriever):
+ r"""BM25 retriever based on pre-built pyserini index."""
+
+ def __init__(self, config):
+ super().__init__(config)
+ from pyserini.search.lucene import LuceneSearcher
+ self.searcher = LuceneSearcher(self.index_path)
+ self.contain_doc = self._check_contain_doc()
+ if not self.contain_doc:
+ self.corpus = load_corpus(self.corpus_path)
+ self.max_process_num = 8
+
+ def _check_contain_doc(self):
+ r"""Check if the index contains document content
+ """
+ return self.searcher.doc(0).raw() is not None
+
+ def _search(self, query: str, num: int = None, return_score = False) -> List[Dict[str, str]]:
+ if num is None:
+ num = self.topk
+
+ hits = self.searcher.search(query, num)
+ if len(hits) < 1:
+ if return_score:
+ return [],[]
+ else:
+ return []
+
+ scores = [hit.score for hit in hits]
+ if len(hits) < num:
+ warnings.warn('Not enough documents retrieved!')
+ else:
+ hits = hits[:num]
+
+ if self.contain_doc:
+ all_contents = [json.loads(self.searcher.doc(hit.docid).raw())['contents'] for hit in hits]
+ results = [{'title': content.split("\n")[0].strip("\""),
+ 'text': "\n".join(content.split("\n")[1:]),
+ 'contents': content} for content in all_contents]
+ else:
+ results = load_docs(self.corpus, [hit.docid for hit in hits])
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list, num: int = None, return_score = False):
+ # TODO: modify batch method
+ results = []
+ scores = []
+ for query in query_list:
+            item_result, item_score = self._search(query, num, True)
+ results.append(item_result)
+ scores.append(item_score)
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_available_gpu_memory():
+ memory_info = []
+ for i in range(torch.cuda.device_count()):
+ total_memory = torch.cuda.get_device_properties(i).total_memory
+ allocated_memory = torch.cuda.memory_allocated(i)
+ free_memory = total_memory - allocated_memory
+ memory_info.append((i, free_memory / 1e9)) # Convert to GB
+ return memory_info
+
+
+class DenseRetriever(BaseRetriever):
+ r"""Dense retriever based on pre-built faiss index."""
+
+    def __init__(self, config):
+ super().__init__(config)
+ self.index = faiss.read_index(self.index_path)
+ if config.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
+ # self.index = faiss.index_cpu_to_all_gpus(self.index)
+
+ self.corpus = load_corpus(self.corpus_path)
+ self.encoder = Encoder(
+ model_name = self.retrieval_method,
+ model_path = config.retrieval_model_path,
+ pooling_method = config.retrieval_pooling_method,
+ max_length = config.retrieval_query_max_length,
+ use_fp16 = config.retrieval_use_fp16
+ )
+ self.topk = config.retrieval_topk
+ self.batch_size = self.config.retrieval_batch_size
+
+ def _search(self, query: str, num: int = None, return_score = False):
+ if num is None:
+ num = self.topk
+ query_emb = self.encoder.encode(query)
+ scores, idxs = self.index.search(query_emb, k=num)
+ idxs = idxs[0]
+ scores = scores[0]
+
+ results = load_docs(self.corpus, idxs)
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score = False):
+ if isinstance(query_list, str):
+ query_list = [query_list]
+ if num is None:
+ num = self.topk
+
+ batch_size = self.batch_size
+
+ results = []
+ scores = []
+
+ for start_idx in tqdm(range(0, len(query_list), batch_size), desc='Retrieval process: '):
+ query_batch = query_list[start_idx:start_idx + batch_size]
+
+ # from time import time
+ # a = time()
+ batch_emb = self.encoder.encode(query_batch)
+ # b = time()
+ # print(f'################### encode time {b-a} #####################')
+ batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
+ batch_scores = batch_scores.tolist()
+ batch_idxs = batch_idxs.tolist()
+ # print(f'################### search time {time()-b} #####################')
+ # exit()
+
+ flat_idxs = sum(batch_idxs, [])
+ batch_results = load_docs(self.corpus, flat_idxs)
+ batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))]
+
+ scores.extend(batch_scores)
+ results.extend(batch_results)
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_retriever(config):
+ r"""Automatically select retriever class based on config's retrieval method
+
+ Args:
+ config (dict): configuration with 'retrieval_method' key
+
+ Returns:
+ Retriever: retriever instance
+ """
+ if config.retrieval_method == "bm25":
+ return BM25Retriever(config)
+ else:
+ return DenseRetriever(config)
+
+
+def get_dataset(config):
+ """Load dataset from config."""
+
+ split_path = os.path.join(config.dataset_path, f'{config.data_split}.jsonl')
+ return read_jsonl(split_path)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description = "Retrieval")
+
+ # Basic parameters
+ parser.add_argument('--retrieval_method', type=str)
+ parser.add_argument('--retrieval_topk', type=int, default=10)
+ parser.add_argument('--index_path', type=str, default=None)
+ parser.add_argument('--corpus_path', type=str)
+ parser.add_argument('--dataset_path', default=None, type=str)
+
+    # argparse's type=bool treats any non-empty string as True, so use an
+    # explicit on/off flag instead (requires Python 3.9+)
+    parser.add_argument('--faiss_gpu', default=True, action=argparse.BooleanOptionalAction)
+ parser.add_argument('--data_split', default="train", type=str)
+
+ parser.add_argument('--retrieval_model_path', type=str, default=None)
+ parser.add_argument('--retrieval_pooling_method', default='mean', type=str)
+    parser.add_argument('--retrieval_query_max_length', default=256, type=int)
+ parser.add_argument('--retrieval_use_fp16', action='store_true', default=False)
+ parser.add_argument('--retrieval_batch_size', default=512, type=int)
+
+ args = parser.parse_args()
+
+ args.index_path = os.path.join(args.index_path, f'{args.retrieval_method}_Flat.index') if args.retrieval_method != 'bm25' else os.path.join(args.index_path, 'bm25')
+
+ # load dataset
+ all_split = get_dataset(args)
+
+ input_query = [sample['question'] for sample in all_split[:512]]
+
+ # initialize the retriever and conduct retrieval
+ retriever = get_retriever(args)
+ print('Start Retrieving ...')
+ results, scores = retriever.batch_search(input_query, return_score=True)
+
+ # from IPython import embed
+ # embed()
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5326ea2840f3a816540fea28f8b557ae02291248
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh
@@ -0,0 +1,25 @@
+
+DATA_NAME=nq
+
+DATASET_PATH="/home/peterjin/mnt/data/$DATA_NAME"
+
+SPLIT='test'
+TOPK=3
+
+INDEX_PATH=/home/peterjin/mnt/index/wiki-18
+CORPUS_PATH=/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl
+SAVE_NAME=e5_${TOPK}_wiki18.json
+
+# INDEX_PATH=/home/peterjin/rm_retrieval_corpus/index/wiki-21
+# CORPUS_PATH=/home/peterjin/rm_retrieval_corpus/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl
+# SAVE_NAME=e5_${TOPK}_wiki21.json
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python retrieval.py --retrieval_method e5 \
+ --retrieval_topk $TOPK \
+ --index_path $INDEX_PATH \
+ --corpus_path $CORPUS_PATH \
+ --dataset_path $DATASET_PATH \
+ --data_split $SPLIT \
+ --retrieval_model_path "intfloat/e5-base-v2" \
+ --retrieval_pooling_method "mean" \
+ --retrieval_batch_size 512 \
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py
new file mode 100644
index 0000000000000000000000000000000000000000..de0a4df6d7adc71c8366938572898c6116276c0e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py
@@ -0,0 +1,23 @@
+import requests
+
+# URL for your local FastAPI server
+url = "http://127.0.0.1:8000/retrieve"
+
+# Example payload
+payload = {
+ "queries": ["What is the capital of France?", "Explain neural networks."] * 200,
+ "topk": 5,
+ "return_scores": True
+}
+
+# Send POST request
+response = requests.post(url, json=payload)
+
+# Raise an exception if the request failed
+response.raise_for_status()
+
+# Get the JSON response
+retrieved_data = response.json()
+
+print("Response from server:")
+print(retrieved_data)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e14f7bcde1c8c50076ccf464e5e5acdc1bdcff
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py
@@ -0,0 +1,123 @@
+# pip install -U sentence-transformers
+import os
+import re
+import argparse
+from dataclasses import dataclass, field
+from typing import List, Optional
+from collections import defaultdict
+
+import torch
+import numpy as np
+from fastapi import FastAPI
+from pydantic import BaseModel
+from sentence_transformers import CrossEncoder
+
+from retrieval_server import get_retriever, Config as RetrieverConfig
+from rerank_server import SentenceTransformerCrossEncoder
+
+app = FastAPI()
+
+def convert_title_format(text):
+ # Use regex to extract the title and the content
+ match = re.match(r'\(Title:\s*([^)]+)\)\s*(.+)', text, re.DOTALL)
+ if match:
+ title, content = match.groups()
+ return f'\"{title}\"\n{content}'
+ else:
+ return text
+
+# ----------- Combined Request Schema -----------
+class SearchRequest(BaseModel):
+ queries: List[str]
+ topk_retrieval: Optional[int] = 10
+ topk_rerank: Optional[int] = 3
+ return_scores: bool = False
+
+# ----------- Reranker Config Schema -----------
+@dataclass
+class RerankerArguments:
+ max_length: int = field(default=512)
+ rerank_topk: int = field(default=3)
+ rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2")
+ batch_size: int = field(default=32)
+ reranker_type: str = field(default="sentence_transformer")
+
+def get_reranker(config):
+ if config.reranker_type == "sentence_transformer":
+ return SentenceTransformerCrossEncoder.load(
+ config.rerank_model_name_or_path,
+ batch_size=config.batch_size,
+ device="cuda" if torch.cuda.is_available() else "cpu"
+ )
+ else:
+ raise ValueError(f"Unknown reranker type: {config.reranker_type}")
+
+# ----------- Endpoint -----------
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ # Step 1: Retrieve documents
+ retrieved_docs = retriever.batch_search(
+ query_list=request.queries,
+ num=request.topk_retrieval,
+ return_score=False
+ )
+
+ # Step 2: Rerank
+ reranked = reranker.rerank(request.queries, retrieved_docs)
+
+ # Step 3: Format response
+ response = []
+ for i, doc_scores in reranked.items():
+ doc_scores = doc_scores[:request.topk_rerank]
+ if request.return_scores:
+ combined = []
+ for doc, score in doc_scores:
+ combined.append({"document": convert_title_format(doc), "score": score})
+ response.append(combined)
+ else:
+ response.append([convert_title_format(doc) for doc, _ in doc_scores])
+
+ return {"result": response}
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
+ # retriever
+ parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.")
+ parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.")
+ parser.add_argument("--retrieval_topk", type=int, default=10, help="Number of retrieved passages for one query.")
+ parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.")
+ parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.")
+ parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation')
+ # reranker
+ parser.add_argument("--reranking_topk", type=int, default=3, help="Number of reranked passages for one query.")
+ parser.add_argument("--reranker_model", type=str, default="cross-encoder/ms-marco-MiniLM-L12-v2", help="Path of the reranker model.")
+ parser.add_argument("--reranker_batch_size", type=int, default=32, help="Batch size for the reranker inference.")
+
+ args = parser.parse_args()
+
+ # ----------- Load Retriever and Reranker -----------
+ retriever_config = RetrieverConfig(
+ retrieval_method = args.retriever_name,
+ index_path=args.index_path,
+ corpus_path=args.corpus_path,
+ retrieval_topk=args.retrieval_topk,
+ faiss_gpu=args.faiss_gpu,
+ retrieval_model_path=args.retriever_model,
+ retrieval_pooling_method="mean",
+ retrieval_query_max_length=256,
+ retrieval_use_fp16=True,
+ retrieval_batch_size=512,
+ )
+ retriever = get_retriever(retriever_config)
+
+ reranker_config = RerankerArguments(
+ rerank_topk = args.reranking_topk,
+ rerank_model_name_or_path = args.reranker_model,
+ batch_size = args.reranker_batch_size,
+ )
+ reranker = get_reranker(reranker_config)
+
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..f39698980c1da3abdf715dcdd78916cf1dbdc935
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py
@@ -0,0 +1,392 @@
+import json
+import os
+import warnings
+from typing import List, Dict, Optional
+import argparse
+
+import faiss
+import torch
+import numpy as np
+from transformers import AutoConfig, AutoTokenizer, AutoModel
+from tqdm import tqdm
+import datasets
+
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4
+ )
+ return corpus
+
+def read_jsonl(file_path):
+ data = []
+ with open(file_path, "r") as f:
+ for line in f:
+ data.append(json.loads(line))
+ return data
+
+def load_docs(corpus, doc_idxs):
+ results = [corpus[int(idx)] for idx in doc_idxs]
+ return results
+
+def load_model(model_path: str, use_fp16: bool = False):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+ return model, tokenizer
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
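+
+# Note on "mean" pooling: padding positions are zeroed out using the attention mask,
+# then hidden states are summed and divided by the number of real tokens, i.e. a
+# masked average over the sequence dimension.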
+
+class Encoder:
+ def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
+ self.model_name = model_name
+ self.model_path = model_path
+ self.pooling_method = pooling_method
+ self.max_length = max_length
+ self.use_fp16 = use_fp16
+
+ self.model, self.tokenizer = load_model(model_path=model_path, use_fp16=use_fp16)
+ self.model.eval()
+
+ @torch.no_grad()
+ def encode(self, query_list: List[str], is_query=True) -> np.ndarray:
+ # processing query for different encoders
+ if isinstance(query_list, str):
+ query_list = [query_list]
+
+ if "e5" in self.model_name.lower():
+ if is_query:
+ query_list = [f"query: {query}" for query in query_list]
+ else:
+ query_list = [f"passage: {query}" for query in query_list]
+
+ if "bge" in self.model_name.lower():
+ if is_query:
+ query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list]
+
+ inputs = self.tokenizer(query_list,
+ max_length=self.max_length,
+ padding=True,
+ truncation=True,
+ return_tensors="pt"
+ )
+ inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ if "T5" in type(self.model).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.model(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ query_emb = output.last_hidden_state[:, 0, :]
+ else:
+ output = self.model(**inputs, return_dict=True)
+ query_emb = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.model_name.lower():
+ query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
+
+ query_emb = query_emb.detach().cpu().numpy()
+ query_emb = query_emb.astype(np.float32, order="C")
+
+ del inputs, output
+ torch.cuda.empty_cache()
+
+ return query_emb
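+
+# Usage sketch (assumes a CUDA device; model path matches the CLI defaults below):
+#   encoder = Encoder("e5", "intfloat/e5-base-v2", "mean", 256, use_fp16=True)
+#   emb = encoder.encode(["what is faiss?"])  # float32 array of shape (1, hidden_dim)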
+
+class BaseRetriever:
+ def __init__(self, config):
+ self.config = config
+ self.retrieval_method = config.retrieval_method
+ self.topk = config.retrieval_topk
+
+ self.index_path = config.index_path
+ self.corpus_path = config.corpus_path
+
+ def _search(self, query: str, num: int, return_score: bool):
+ raise NotImplementedError
+
+ def _batch_search(self, query_list: List[str], num: int, return_score: bool):
+ raise NotImplementedError
+
+ def search(self, query: str, num: int = None, return_score: bool = False):
+ return self._search(query, num, return_score)
+
+ def batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ return self._batch_search(query_list, num, return_score)
+
+class BM25Retriever(BaseRetriever):
+ def __init__(self, config):
+ super().__init__(config)
+ from pyserini.search.lucene import LuceneSearcher
+ self.searcher = LuceneSearcher(self.index_path)
+ self.contain_doc = self._check_contain_doc()
+ if not self.contain_doc:
+ self.corpus = load_corpus(self.corpus_path)
+ self.max_process_num = 8
+
+ def _check_contain_doc(self):
+ return self.searcher.doc(0).raw() is not None
+
+ def _search(self, query: str, num: int = None, return_score: bool = False):
+ if num is None:
+ num = self.topk
+ hits = self.searcher.search(query, num)
+ if len(hits) < 1:
+ if return_score:
+ return [], []
+ else:
+ return []
+ scores = [hit.score for hit in hits]
+ if len(hits) < num:
+ warnings.warn('Not enough documents retrieved!')
+ else:
+ hits = hits[:num]
+
+ if self.contain_doc:
+ all_contents = [
+ json.loads(self.searcher.doc(hit.docid).raw())['contents']
+ for hit in hits
+ ]
+ results = [
+ {
+ 'title': content.split("\n")[0].strip("\""),
+ 'text': "\n".join(content.split("\n")[1:]),
+ 'contents': content
+ }
+ for content in all_contents
+ ]
+ else:
+ results = load_docs(self.corpus, [hit.docid for hit in hits])
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ results = []
+ scores = []
+ for query in query_list:
+ item_result, item_score = self._search(query, num, True)
+ results.append(item_result)
+ scores.append(item_score)
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+class DenseRetriever(BaseRetriever):
+ def __init__(self, config):
+ super().__init__(config)
+ self.index = faiss.read_index(self.index_path)
+ if config.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
+
+ self.corpus = load_corpus(self.corpus_path)
+ self.encoder = Encoder(
+ model_name = self.retrieval_method,
+ model_path = config.retrieval_model_path,
+ pooling_method = config.retrieval_pooling_method,
+ max_length = config.retrieval_query_max_length,
+ use_fp16 = config.retrieval_use_fp16
+ )
+ self.topk = config.retrieval_topk
+ self.batch_size = config.retrieval_batch_size
+
+ def _search(self, query: str, num: int = None, return_score: bool = False):
+ if num is None:
+ num = self.topk
+ query_emb = self.encoder.encode(query)
+ scores, idxs = self.index.search(query_emb, k=num)
+ idxs = idxs[0]
+ scores = scores[0]
+ results = load_docs(self.corpus, idxs)
+ if return_score:
+ return results, scores.tolist()
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ if isinstance(query_list, str):
+ query_list = [query_list]
+ if num is None:
+ num = self.topk
+
+ results = []
+ scores = []
+ for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc='Retrieval process: '):
+ query_batch = query_list[start_idx:start_idx + self.batch_size]
+ batch_emb = self.encoder.encode(query_batch)
+ batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
+ batch_scores = batch_scores.tolist()
+ batch_idxs = batch_idxs.tolist()
+
+ # load_docs is not vectorized; fetch all docs for the batch in one
+ # flat list, then re-chunk them into per-query groups below
+ flat_idxs = sum(batch_idxs, [])
+ batch_results = load_docs(self.corpus, flat_idxs)
+ # chunk them back
+ batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))]
+
+ results.extend(batch_results)
+ scores.extend(batch_scores)
+
+ del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results
+ torch.cuda.empty_cache()
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_retriever(config):
+ if config.retrieval_method == "bm25":
+ return BM25Retriever(config)
+ else:
+ return DenseRetriever(config)
+
+
+#####################################
+# FastAPI server below
+#####################################
+
+class Config:
+ """
+ Minimal config class (simulating your argparse)
+ Replace this with your real arguments or load them dynamically.
+ """
+ def __init__(
+ self,
+ retrieval_method: str = "bm25",
+ retrieval_topk: int = 10,
+ index_path: str = "./index/bm25",
+ corpus_path: str = "./data/corpus.jsonl",
+ dataset_path: str = "./data",
+ data_split: str = "train",
+ faiss_gpu: bool = True,
+ retrieval_model_path: str = "./model",
+ retrieval_pooling_method: str = "mean",
+ retrieval_query_max_length: int = 256,
+ retrieval_use_fp16: bool = False,
+ retrieval_batch_size: int = 128
+ ):
+ self.retrieval_method = retrieval_method
+ self.retrieval_topk = retrieval_topk
+ self.index_path = index_path
+ self.corpus_path = corpus_path
+ self.dataset_path = dataset_path
+ self.data_split = data_split
+ self.faiss_gpu = faiss_gpu
+ self.retrieval_model_path = retrieval_model_path
+ self.retrieval_pooling_method = retrieval_pooling_method
+ self.retrieval_query_max_length = retrieval_query_max_length
+ self.retrieval_use_fp16 = retrieval_use_fp16
+ self.retrieval_batch_size = retrieval_batch_size
+
+
+class QueryRequest(BaseModel):
+ queries: List[str]
+ topk: Optional[int] = None
+ return_scores: bool = False
+
+
+app = FastAPI()
+
+@app.post("/retrieve")
+def retrieve_endpoint(request: QueryRequest):
+ """
+ Endpoint that accepts queries and performs retrieval.
+ Input format:
+ {
+ "queries": ["What is Python?", "Tell me about neural networks."],
+ "topk": 3,
+ "return_scores": true
+ }
+ """
+ if not request.topk:
+ request.topk = config.retrieval_topk # fallback to default
+
+ # Perform batch retrieval (always request scores so batch_search returns a
+ # (results, scores) tuple; otherwise the unpacking below would fail whenever
+ # return_scores is false)
+ results, scores = retriever.batch_search(
+ query_list=request.queries,
+ num=request.topk,
+ return_score=True
+ )
+
+ # Format response
+ resp = []
+ for i, single_result in enumerate(results):
+ if request.return_scores:
+ # If scores are returned, combine them with results
+ combined = []
+ for doc, score in zip(single_result, scores[i]):
+ combined.append({"document": doc, "score": score})
+ resp.append(combined)
+ else:
+ resp.append(single_result)
+ return {"result": resp}
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
+ parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.")
+ parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.")
+ parser.add_argument("--topk", type=int, default=3, help="Number of retrieved passages for one query.")
+ parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.")
+ parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.")
+ parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation')
+
+ args = parser.parse_args()
+
+ # 1) Build a config from the parsed CLI arguments.
+ config = Config(
+ retrieval_method=args.retriever_name, # "bm25" or a dense encoder name such as "e5"
+ index_path=args.index_path,
+ corpus_path=args.corpus_path,
+ retrieval_topk=args.topk,
+ faiss_gpu=args.faiss_gpu,
+ retrieval_model_path=args.retriever_model,
+ retrieval_pooling_method="mean",
+ retrieval_query_max_length=256,
+ retrieval_use_fp16=True,
+ retrieval_batch_size=512,
+ )
+
+ # 2) Instantiate a global retriever so it is loaded once and reused.
+ retriever = get_retriever(config)
+
+ # 3) Launch the server; host 0.0.0.0 listens on all interfaces at port 8000.
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a10de3fa44aa6af20a12417ed9cf215319ad6f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py
@@ -0,0 +1,112 @@
+import os
+import requests
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+import argparse
+import uvicorn
+
+parser = argparse.ArgumentParser(description="Launch online search server.")
+parser.add_argument('--search_url', type=str, required=True,
+ help="URL for search engine (e.g. https://serpapi.com/search)")
+parser.add_argument('--topk', type=int, default=3,
+ help="Number of results to return per query")
+parser.add_argument('--serp_api_key', type=str, default=None,
+ help="SerpAPI key for online search")
+parser.add_argument('--serp_engine', type=str, default="google",
+ help="SerpAPI engine for online search")
+args = parser.parse_args()
+
+# --- Config ---
+class OnlineSearchConfig:
+ def __init__(
+ self,
+ search_url: str = "https://serpapi.com/search",
+ topk: int = 3,
+ serp_api_key: Optional[str] = None,
+ serp_engine: Optional[str] = None,
+ ):
+ self.search_url = search_url
+ self.topk = topk
+ self.serp_api_key = serp_api_key
+ self.serp_engine = serp_engine
+
+
+# --- Online Search Wrapper ---
+class OnlineSearchEngine:
+ def __init__(self, config: OnlineSearchConfig):
+ self.config = config
+
+ def _search_query(self, query: str):
+ params = {
+ "engine": self.config.serp_engine,
+ "q": query,
+ "api_key": self.config.serp_api_key,
+ }
+ response = requests.get(self.config.search_url, params=params)
+ return response.json()
+
+ def batch_search(self, queries: List[str]):
+ results = []
+ with ThreadPoolExecutor() as executor:
+ for result in executor.map(self._search_query, queries):
+ results.append(self._process_result(result))
+ return results
+
+ def _process_result(self, search_result: Dict):
+ results = []
+
+ answer_box = search_result.get('answer_box', {})
+ if answer_box:
+ title = answer_box.get('title', 'No title.')
+ snippet = answer_box.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ organic_results = search_result.get('organic_results', [])
+ for result in organic_results[:self.config.topk]:
+ title = result.get('title', 'No title.')
+ snippet = result.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ related_results = search_result.get('related_questions', [])
+ for result in related_results[:self.config.topk]:
+ title = result.get('question', 'No title.') # question is the title here
+ snippet = result.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ return results
+
+
+# --- FastAPI Setup ---
+app = FastAPI(title="Online Search Proxy Server")
+
+class SearchRequest(BaseModel):
+ queries: List[str]
+
+# Instantiate global config + engine
+config = OnlineSearchConfig(
+ search_url=args.search_url,
+ topk=args.topk,
+ serp_api_key=args.serp_api_key,
+ serp_engine=args.serp_engine,
+)
+engine = OnlineSearchEngine(config)
+
+# --- Routes ---
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ results = engine.batch_search(request.queries)
+ return {"result": results}
+
+## return {"result": List[List[{'document': {"id": xx, "content": "title" + \n + "content"}, 'score': xx}]]}
+
+if __name__ == "__main__":
+ # Launch the server; host 0.0.0.0 listens on all interfaces at port 8000.
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt b/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e9406e91a62af724dbb31fe4f07363c0b81bafc7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt
@@ -0,0 +1,190 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+./search_r1/__init__.py
+./search_r1/llm_agent/__init__.py
+./search_r1/llm_agent/generation.py
+./search_r1/llm_agent/tensor_helper.py
+./verl/__init__.py
+./verl/protocol.py
+./verl/models/__init__.py
+./verl/models/registry.py
+./verl/models/weight_loader_registry.py
+./verl/models/llama/__init__.py
+./verl/models/llama/megatron/__init__.py
+./verl/models/llama/megatron/modeling_llama_megatron.py
+./verl/models/llama/megatron/checkpoint_utils/__init__.py
+./verl/models/llama/megatron/checkpoint_utils/llama_loader.py
+./verl/models/llama/megatron/checkpoint_utils/llama_saver.py
+./verl/models/llama/megatron/layers/__init__.py
+./verl/models/llama/megatron/layers/parallel_attention.py
+./verl/models/llama/megatron/layers/parallel_decoder.py
+./verl/models/llama/megatron/layers/parallel_linear.py
+./verl/models/llama/megatron/layers/parallel_mlp.py
+./verl/models/llama/megatron/layers/parallel_rmsnorm.py
+./verl/models/transformers/__init__.py
+./verl/models/transformers/llama.py
+./verl/models/transformers/monkey_patch.py
+./verl/models/transformers/qwen2.py
+./verl/single_controller/__init__.py
+./verl/single_controller/base/__init__.py
+./verl/single_controller/base/decorator.py
+./verl/single_controller/base/worker.py
+./verl/single_controller/base/worker_group.py
+./verl/single_controller/base/megatron/__init__.py
+./verl/single_controller/base/megatron/worker.py
+./verl/single_controller/base/megatron/worker_group.py
+./verl/single_controller/base/register_center/__init__.py
+./verl/single_controller/base/register_center/ray.py
+./verl/single_controller/ray/__init__.py
+./verl/single_controller/ray/base.py
+./verl/single_controller/ray/megatron.py
+./verl/third_party/__init__.py
+./verl/third_party/vllm/__init__.py
+./verl/third_party/vllm/vllm_v_0_3_1/__init__.py
+./verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py
+./verl/third_party/vllm/vllm_v_0_3_1/config.py
+./verl/third_party/vllm/vllm_v_0_3_1/llm.py
+./verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py
+./verl/third_party/vllm/vllm_v_0_3_1/model_loader.py
+./verl/third_party/vllm/vllm_v_0_3_1/model_runner.py
+./verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py
+./verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py
+./verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_3_1/worker.py
+./verl/third_party/vllm/vllm_v_0_4_2/__init__.py
+./verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
+./verl/third_party/vllm/vllm_v_0_4_2/config.py
+./verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py
+./verl/third_party/vllm/vllm_v_0_4_2/llm.py
+./verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
+./verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_4_2/model_loader.py
+./verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
+./verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
+./verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
+./verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
+./verl/third_party/vllm/vllm_v_0_4_2/worker.py
+./verl/third_party/vllm/vllm_v_0_5_4/__init__.py
+./verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py
+./verl/third_party/vllm/vllm_v_0_5_4/config.py
+./verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py
+./verl/third_party/vllm/vllm_v_0_5_4/llm.py
+./verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
+./verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_5_4/model_loader.py
+./verl/third_party/vllm/vllm_v_0_5_4/model_runner.py
+./verl/third_party/vllm/vllm_v_0_5_4/parallel_state.py
+./verl/third_party/vllm/vllm_v_0_5_4/spmd_gpu_executor.py
+./verl/third_party/vllm/vllm_v_0_5_4/tokenizer.py
+./verl/third_party/vllm/vllm_v_0_5_4/worker.py
+./verl/third_party/vllm/vllm_v_0_6_3/__init__.py
+./verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py
+./verl/third_party/vllm/vllm_v_0_6_3/config.py
+./verl/third_party/vllm/vllm_v_0_6_3/dtensor_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_6_3/hf_weight_loader.py
+./verl/third_party/vllm/vllm_v_0_6_3/llm.py
+./verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py
+./verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py
+./verl/third_party/vllm/vllm_v_0_6_3/model_loader.py
+./verl/third_party/vllm/vllm_v_0_6_3/model_runner.py
+./verl/third_party/vllm/vllm_v_0_6_3/parallel_state.py
+./verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py
+./verl/third_party/vllm/vllm_v_0_6_3/tokenizer.py
+./verl/third_party/vllm/vllm_v_0_6_3/worker.py
+./verl/trainer/__init__.py
+./verl/trainer/fsdp_sft_trainer.py
+./verl/trainer/main_eval.py
+./verl/trainer/main_generation.py
+./verl/trainer/main_ppo.py
+./verl/trainer/main_ppo_format.py
+./verl/trainer/config/evaluation.yaml
+./verl/trainer/config/generation.yaml
+./verl/trainer/config/ppo_megatron_trainer.yaml
+./verl/trainer/config/ppo_trainer.yaml
+./verl/trainer/config/sft_trainer.yaml
+./verl/trainer/ppo/__init__.py
+./verl/trainer/ppo/core_algos.py
+./verl/trainer/ppo/ray_trainer.py
+./verl/utils/__init__.py
+./verl/utils/config.py
+./verl/utils/distributed.py
+./verl/utils/flops_counter.py
+./verl/utils/fs.py
+./verl/utils/fsdp_utils.py
+./verl/utils/hdfs_io.py
+./verl/utils/import_utils.py
+./verl/utils/logging_utils.py
+./verl/utils/megatron_utils.py
+./verl/utils/memory_buffer.py
+./verl/utils/model.py
+./verl/utils/py_functional.py
+./verl/utils/ray_utils.py
+./verl/utils/seqlen_balancing.py
+./verl/utils/tokenizer.py
+./verl/utils/torch_dtypes.py
+./verl/utils/torch_functional.py
+./verl/utils/tracking.py
+./verl/utils/ulysses.py
+./verl/utils/dataset/__init__.py
+./verl/utils/dataset/rl_dataset.py
+./verl/utils/dataset/rm_dataset.py
+./verl/utils/debug/__init__.py
+./verl/utils/debug/performance.py
+./verl/utils/debug/trajectory_tracker.py
+./verl/utils/logger/__init__.py
+./verl/utils/logger/aggregate_logger.py
+./verl/utils/megatron/__init__.py
+./verl/utils/megatron/memory.py
+./verl/utils/megatron/optimizer.py
+./verl/utils/megatron/optimizer_config.py
+./verl/utils/megatron/pipeline_parallel.py
+./verl/utils/megatron/sequence_parallel.py
+./verl/utils/megatron/tensor_parallel.py
+./verl/utils/rendezvous/__init__.py
+./verl/utils/rendezvous/ray_backend.py
+./verl/utils/reward_score/__init__.py
+./verl/utils/reward_score/countdown.py
+./verl/utils/reward_score/gsm8k.py
+./verl/utils/reward_score/math.py
+./verl/utils/reward_score/multiply.py
+./verl/utils/reward_score/qa_em.py
+./verl/utils/reward_score/qa_em_format.py
+./verl/version/version
+./verl/workers/__init__.py
+./verl/workers/fsdp_workers.py
+./verl/workers/megatron_workers.py
+./verl/workers/actor/__init__.py
+./verl/workers/actor/base.py
+./verl/workers/actor/dp_actor.py
+./verl/workers/actor/megatron_actor.py
+./verl/workers/critic/__init__.py
+./verl/workers/critic/base.py
+./verl/workers/critic/dp_critic.py
+./verl/workers/critic/megatron_critic.py
+./verl/workers/reward_model/__init__.py
+./verl/workers/reward_model/base.py
+./verl/workers/reward_model/megatron/__init__.py
+./verl/workers/reward_model/megatron/reward_model.py
+./verl/workers/rollout/__init__.py
+./verl/workers/rollout/base.py
+./verl/workers/rollout/hf_rollout.py
+./verl/workers/rollout/tokenizer.py
+./verl/workers/rollout/naive/__init__.py
+./verl/workers/rollout/naive/naive_rollout.py
+./verl/workers/rollout/vllm_rollout/__init__.py
+./verl/workers/rollout/vllm_rollout/vllm_rollout.py
+./verl/workers/sharding_manager/__init__.py
+./verl/workers/sharding_manager/base.py
+./verl/workers/sharding_manager/fsdp_ulysses.py
+./verl/workers/sharding_manager/fsdp_vllm.py
+./verl/workers/sharding_manager/megatron_vllm.py
+verl.egg-info/PKG-INFO
+verl.egg-info/SOURCES.txt
+verl.egg-info/dependency_links.txt
+verl.egg-info/requires.txt
+verl.egg-info/top_level.txt
+verl/version/version
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd850b790c7ef7ea88515b58e629cad45c0c84e2
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+version_folder = os.path.dirname(os.path.abspath(__file__))
+
+with open(os.path.join(version_folder, 'version/version')) as f:
+ __version__ = f.read().strip()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/__init__.py b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..018bdd8fdbe01dddda5da009694246021320ab44
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py
@@ -0,0 +1,69 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Offline evaluation of generated sequences using a reward model and a ground-truth verifier.
+The input is a parquet file that contains N generated sequences per prompt and (optionally) the ground truth.
+
+"""
+
+import hydra
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.reward_score import math, gsm8k
+import pandas as pd
+import numpy as np
+
+
+def select_reward_fn(data_source):
+ if data_source == 'lighteval/MATH':
+ return math.compute_score
+ else:
+ raise NotImplementedError
+
+
+@hydra.main(config_path='config', config_name='evaluation', version_base=None)
+def main(config):
+ local_path = copy_local_path_from_hdfs(config.data.path)
+ dataset = pd.read_parquet(local_path)
+ prompts = dataset[config.data.prompt_key]
+ responses = dataset[config.data.response_key]
+ data_sources = dataset[config.data.data_source_key]
+ reward_model_data = dataset[config.data.reward_model_key]
+
+ passes = 0
+
+ total = len(dataset)
+
+ for i in range(total):
+ response_lst = responses[i]
+ data_source = data_sources[i]
+ # select reward score based on data_source
+ prompt = prompts[i]
+ reward_data = reward_model_data[i]
+ reward_fn = select_reward_fn(data_source)
+ ground_truth = reward_data['ground_truth']
+ score_lst = []
+ for r in response_lst:
+ score = reward_fn(r, ground_truth)
+ score_lst.append(score)
+
+ max_score = np.max(score_lst)
+
+ if max_score == 1:
+ passes += 1
+
+ # N in pass@N equals the number of generated responses per prompt
+ n_responses = len(responses[0]) if total > 0 else 0
+ print(f'pass@{n_responses}: {passes / total}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/utils/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e453070a16370cd7006e0a7700c8550a56f19051
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import tokenizer
+from .tokenizer import *
+
+__all__ = tokenizer.__all__
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/config.py b/code/RL_model/verl/Search-R1/verl/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9298c42adf89467d047a3d0fdf8919bf772a5a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/config.py
@@ -0,0 +1,23 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from omegaconf import DictConfig
+
+
+def update_dict_with_config(dictionary: Dict, config: DictConfig):
+ for key in dictionary:
+ if hasattr(config, key):
+ dictionary[key] = getattr(config, key)
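+
+# Example: given dictionary = {"lr": 0.1} and a DictConfig with config.lr == 1e-6,
+# update_dict_with_config(dictionary, config) overwrites dictionary["lr"] with 1e-6;
+# keys absent from the config are left untouched.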
diff --git a/code/RL_model/verl/Search-R1/verl/utils/distributed.py b/code/RL_model/verl/Search-R1/verl/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fea5a29cd943ef91c8f27f44db2a69e40702cf7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/distributed.py
@@ -0,0 +1,28 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for distributed training."""
+import os
+
+
+def initialize_global_process_group(timeout_second=36000):
+ import torch.distributed
+ from datetime import timedelta
+ torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second))
+ local_rank = int(os.environ["LOCAL_RANK"])
+ rank = int(os.environ["RANK"])
+ world_size = int(os.environ["WORLD_SIZE"])
+
+ if torch.distributed.is_initialized():
+ torch.cuda.set_device(local_rank)
+ return local_rank, rank, world_size
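+
+# Typical usage (assumes a torchrun launch, which sets RANK, LOCAL_RANK and WORLD_SIZE):
+#   local_rank, rank, world_size = initialize_global_process_group()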
diff --git a/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c5ac1a91160fc3265589fb6e93e93c8c1efb53e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py
@@ -0,0 +1,123 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import PretrainedConfig, Qwen2Config, LlamaConfig
+
+VALID_CONFIG_TYPE = (Qwen2Config, LlamaConfig)
+
+
+def get_device_flops(unit="T"):
+
+ def unit_convert(number, level):
+ units = ["B", "K", "M", "G", "T", "P"]
+ if number <= 0:
+ return number
+ ptr = 0
+ while ptr < len(units) and units[ptr] != level:
+ number /= 1000
+ ptr += 1
+ return number
+
+ device_name = torch.cuda.get_device_name()
+ flops = float("inf") # INF flops for unkown gpu type
+ if "H100" in device_name or "H800" in device_name:
+ flops = 989e12
+ elif "A100" in device_name or "A800" in device_name:
+ flops = 312e12
+ elif "L40" in device_name:
+ flops = 181.05e12
+ elif "L20" in device_name:
+ flops = 119.5e12
+ elif "H20" in device_name:
+ flops = 148e12
+ elif "910B" in device_name:
+ flops = 354e12
+ flops_unit = unit_convert(flops, unit)
+ return flops_unit
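+
+# e.g. on an A100, get_device_flops("T") returns 312.0 (312e12 FLOPs -> 312 TFLOPs)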
+
+
+class FlopsCounter:
+ """
+ Used to count mfu during training loop
+
+ Example:
+ flops_counter = FlopsCounter(config)
+ flops_achieved, flops_promised = flops_counter.estimate_flops(tokens_list, delta_time)
+
+ """
+
+ def __init__(self, config: PretrainedConfig):
+ if not isinstance(config, VALID_CONFIG_TYPE):
+ print(f"Only support config type of {VALID_CONFIG_TYPE}, but got {type(config)}. "
+ f"MFU will always be zero.")
+
+ self.estimate_func = {"qwen2": self._estimate_qwen2_flops, 'llama': self._estimate_qwen2_flops}
+ self.config = config
+
+ def _estimate_unknown_flops(self, tokens_sum, batch_seqlens, delta_time):
+ return 0
+
+ def _estimate_qwen2_flops(self, tokens_sum, batch_seqlens, delta_time):
+ assert isinstance(self.config, (Qwen2Config, LlamaConfig))
+ hidden_size = self.config.hidden_size
+ vocab_size = self.config.vocab_size
+ num_hidden_layers = self.config.num_hidden_layers
+ num_key_value_heads = self.config.num_key_value_heads
+ num_attention_heads = self.config.num_attention_heads
+ intermediate_size = self.config.intermediate_size
+
+ head_dim = hidden_size // num_attention_heads
+ q_size = num_attention_heads * head_dim
+ k_size = num_key_value_heads * head_dim
+ v_size = num_key_value_heads * head_dim
+
+ # non-attention params per layer
+ # Qwen2/Llama use SwiGLU: gate, up, and down linear layers in the MLP
+ mlp_N = hidden_size * intermediate_size * 3
+ attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
+ emb_and_lm_head_N = vocab_size * hidden_size * 2
+ # non-attention params across all layers
+ dense_N = (mlp_N + attn_linear_N) * num_hidden_layers + emb_and_lm_head_N
+ # non-attn all_layer & all_token fwd & bwd flops
+ dense_N_flops = 6 * dense_N * tokens_sum
+
+ # attn all_layer & all_token fwd & bwd flops
+ seqlen_square_sum = 0
+ for seqlen in batch_seqlens:
+ seqlen_square_sum += seqlen * seqlen
+ attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers
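+ # The 12x factor covers the two s^2-sized attention matmuls (QK^T and attn*V,
+ # 2 FLOPs per MAC each) over forward plus backward (~3x the forward cost);
+ # likewise the 6x above is 2 FLOPs per parameter forward + 4 backward.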
+
+ # all_layer & all_token fwd & bwd flops
+ flops_all_token = dense_N_flops + attn_qkv_flops
+ flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
+ return flops_achieved
+
+ def estimate_flops(self, batch_seqlens, delta_time):
+ """
+ Estimate the FLOPS based on the number of valid tokens in the current batch and the time taken.
+
+ Args:
+ batch_seqlens (List[int]): A list where each element is the number of valid tokens in one sequence of the current batch.
+ delta_time (float): The time taken to process the batch, in seconds.
+
+ Returns:
+ estimated_flops (float): The estimated FLOPS based on the input tokens and time.
+ promised_flops (float): The expected FLOPS of the current device.
+ """
+ tokens_sum = sum(batch_seqlens)
+ func = self.estimate_func.get(self.config.model_type, self._estimate_unknown_flops)
+ estimated_flops = func(tokens_sum, batch_seqlens, delta_time)
+ promised_flops = get_device_flops()
+ return estimated_flops, promised_flops
diff --git a/code/RL_model/verl/Search-R1/verl/utils/fs.py b/code/RL_model/verl/Search-R1/verl/utils/fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..80c1889be3582fffcdef5267f5e9ac55e1d7e059
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/fs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+"""File-system agnostic IO APIs"""
+import os
+import tempfile
+import hashlib
+
+from .hdfs_io import copy, makedirs, exists
+
+__all__ = ["copy", "exists", "makedirs"]
+
+_HDFS_PREFIX = "hdfs://"
+
+
+def _is_non_local(path):
+ return path.startswith(_HDFS_PREFIX)
+
+
+def md5_encode(path: str) -> str:
+ return hashlib.md5(path.encode()).hexdigest()
+
+
+def get_local_temp_path(hdfs_path: str, cache_dir: str) -> str:
+ """Return a local temp path that joins cache_dir and basename of hdfs_path
+
+ Args:
+ hdfs_path:
+ cache_dir:
+
+ Returns:
+
+ """
+ # make a base64 encoding of hdfs_path to avoid directory conflict
+ encoded_hdfs_path = md5_encode(hdfs_path)
+ temp_dir = os.path.join(cache_dir, encoded_hdfs_path)
+ os.makedirs(temp_dir, exist_ok=True)
+ dst = os.path.join(temp_dir, os.path.basename(hdfs_path))
+ return dst
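+
+# e.g. get_local_temp_path("hdfs://ns/user/x/model.pt", "/tmp")
+#      -> "/tmp/<md5 of the hdfs path>/model.pt"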
+
+
+def copy_local_path_from_hdfs(src: str, cache_dir=None, filelock='.file.lock', verbose=False) -> str:
+ """Copy src from hdfs to local if src is on hdfs or directly return src.
+ If cache_dir is None, we will use the default cache dir of the system. Note that this may cause conflicts if
+ the src name is the same between calls
+
+ Args:
+ src (str): an HDFS path or a local path
+
+ Returns:
+ a local path of the copied file
+ """
+ from filelock import FileLock
+
+ assert src[-1] != '/', f'Make sure the last char in src is not / because it will cause error. Got {src}'
+
+ if _is_non_local(src):
+ # download from hdfs to local
+ if cache_dir is None:
+ # get a temp folder
+ cache_dir = tempfile.gettempdir()
+ os.makedirs(cache_dir, exist_ok=True)
+ assert os.path.exists(cache_dir)
+ local_path = get_local_temp_path(src, cache_dir)
+ # get a specific lock
+ filelock = md5_encode(src) + '.lock'
+ lock_file = os.path.join(cache_dir, filelock)
+ with FileLock(lock_file=lock_file):
+ if not os.path.exists(local_path):
+ if verbose:
+ print(f'Copy from {src} to {local_path}')
+ copy(src, local_path)
+ return local_path
+ else:
+ return src
diff --git a/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0243cd15c2d2defe8e54164c6e07a05c5f6232d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py
@@ -0,0 +1,329 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+import functools
+import json
+import math
+import itertools
+import os
+from contextlib import contextmanager
+from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
+from transformers.trainer_pt_utils import get_module_class_from_name
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+
+def init_fn(x: torch.nn.Module):
+ if torch.distributed.get_rank() != 0:
+ x = x.to_empty(device=torch.cuda.current_device(), recurse=False)
+ torch.cuda.empty_cache()
+ return x
+
+
+def get_init_weight_context_manager(use_meta_tensor=True):
+ from accelerate import init_empty_weights
+ cpu_init_weights = lambda: torch.device('cpu')
+ if use_meta_tensor:
+ init_context = init_empty_weights if torch.distributed.get_rank() != 0 else cpu_init_weights
+ else:
+ init_context = cpu_init_weights
+ return init_context
+
+
+# Copyright 2020-present the HuggingFace Inc. team.
+# Adapted from https://github.com/huggingface/transformers/src/transformers/trainer.py
+def get_fsdp_wrap_policy(module, config=None, is_lora=False):
+ """Get FSDP wrap policy for the module.
+
+ Args:
+ module: The module to get wrap policy for
+ config: Configuration for wrap policy
+ is_lora: Whether to enable lambda policy for LoRA modules
+ """
+ if config is None:
+ config = {}
+
+ if config.get('disable', False):
+ return None
+
+ default_transformer_cls_names_to_wrap = getattr(module, "_no_split_modules", None)
+ fsdp_transformer_layer_cls_to_wrap = config.get("transformer_layer_cls_to_wrap",
+ default_transformer_cls_names_to_wrap)
+ min_num_params = config.get('min_num_params', 0)
+ auto_wrap_policy = None
+
+ policies = []
+
+ from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
+
+ # Add lambda policy for LoRA modules if is_lora is True
+ if is_lora:
+
+ def lambda_policy_fn(module):
+ if (len(list(module.named_children())) == 0 and getattr(module, "weight", None) is not None and
+ module.weight.requires_grad):
+ return True
+ return False
+
+ lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
+ policies.append(lambda_policy)
+
+ if min_num_params > 0:
+ size_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=min_num_params)
+ policies.append(size_policy)
+ elif fsdp_transformer_layer_cls_to_wrap is not None:
+ transformer_cls_to_wrap = set()
+ for layer_class in fsdp_transformer_layer_cls_to_wrap:
+ transformer_cls = get_module_class_from_name(module, layer_class)
+ if transformer_cls is None:
+ raise Exception("Could not find the transformer layer class to wrap in the model.")
+ else:
+ transformer_cls_to_wrap.add(transformer_cls)
+
+ transformer_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ transformer_layer_cls=transformer_cls_to_wrap,
+ )
+ policies.append(transformer_policy)
+
+ if len(policies) > 0:
+ auto_wrap_policy = functools.partial(_or_policy, policies=policies)
+
+ return auto_wrap_policy
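+
+# Example (hypothetical threshold): wrap every submodule with >= 1e8 parameters:
+#   policy = get_fsdp_wrap_policy(model, config={"min_num_params": int(1e8)})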
+
+
+def offload_fsdp_grad(module):
+ for _, param in module.named_parameters():
+ if param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_grad(module, device_id):
+ for _, param in module.named_parameters():
+ if param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def offload_fsdp_param_and_grad(module, offload_grad=False):
+ for _, param in module.named_parameters():
+ if hasattr(param, "_local_shard"):
+ param._local_shard = param._local_shard.to("cpu", non_blocking=True)
+ param.data = param.data.to('cpu', non_blocking=True)
+ if offload_grad and param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_param_and_grad(module, device_id, load_grad=False):
+ for _, param in module.named_parameters():
+ if hasattr(param, "_local_shard"):
+ param._local_shard = param._local_shard.to(device_id, non_blocking=True)
+ param.data = param.data.to(device_id, non_blocking=True)
+ if load_grad and param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def offload_fsdp_optimizer(optimizer):
+ for param_group in optimizer.param_groups:
+ for param in param_group['params']:
+ state = optimizer.state[param]
+ for key, value in state.items():
+ if isinstance(value, torch.Tensor):
+ state[key] = value.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_optimizer(optimizer, device_id):
+ for param_group in optimizer.param_groups:
+ for param in param_group['params']:
+ state = optimizer.state[param]
+ for key, value in state.items():
+ if isinstance(value, torch.Tensor):
+ state[key] = value.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+@contextmanager
+def meta_device_init():
+ """
+ Create model parameters with meta device.
+
+ Note buffers in model will still be initialized in default device (e.g., CPU),
+ since the buffers can be non-persistent and filled with expected values that can
+ NOT be captured in meta device.
+ """
+ device = torch.device("meta")
+ old_register_parameter = nn.Module.register_parameter
+ registered = set()
+
+ def register_empty_parameter(module, name, param):
+ old_register_parameter(module, name, param)
+ # skip re-registering shared parameters, since they
+ # were already registered the first time they appeared
+ if param is not None and param not in registered:
+ param_cls = type(module._parameters[name])
+ kwargs = module._parameters[name].__dict__
+ kwargs["requires_grad"] = param.requires_grad
+ module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
+ registered.add(module._parameters[name])
+
+ try:
+ nn.Module.register_parameter = register_empty_parameter
+ yield
+ finally:
+ registered.clear()
+ nn.Module.register_parameter = old_register_parameter
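+
+# Usage sketch (MyModule is a placeholder for any nn.Module subclass):
+#   with meta_device_init():
+#       model = MyModule()  # parameters are created on the meta device, buffers on CPU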
+
+
+def parallel_load_safetensors(filepath):
+ """
+ Parallel load safetensors from huggingface checkpoint
+
+ Huggingface checkpoint contains:
+
+ - config.json: a json file for model configuration
+ - model.safetensors.index.json: a json file for the safetensors (parameters & buffers) index
+ - model-0000x-of-0000x.safetensors: binary files for safetensors (parameters & buffers) chunks
+
+ Or (when model is small),
+
+ - model.safetensors: a binary file for all parameters and buffers
+
+ Each rank will own a part of model chunks and load them directly into GPU memory.
+ """
+ from safetensors.torch import load_file
+
+ safetensors2param = {}
+
+ index_file = os.path.join(filepath, "model.safetensors.index.json")
+ if os.path.exists(index_file):
+ with open(index_file, "rb") as fin:
+ index = json.load(fin)
+ for param_name, filename in index["weight_map"].items():
+ safetensors2param.setdefault(filename, []).append(param_name)
+ else:
+ # in this case, the model is small and we can load it all at once
+ param_file = os.path.join(filepath, "model.safetensors")
+ assert os.path.exists(param_file), f"Cannot find {param_file}"
+ states = load_file(param_file)
+ for param_name in states:
+ safetensors2param.setdefault("model.safetensors", []).append(param_name)
+ del states
+
+ total_files = len(safetensors2param)
+ ckpt_chunks = sorted(safetensors2param.keys())
+ world_size = dist.get_world_size()
+ size = int(math.ceil(total_files / world_size))
+ ckpt_chunks = [ckpt_chunks[rank * size:rank * size + size] for rank in range(world_size)]
+
+ shard_states = {}
+ device = torch.cuda.current_device()
+ for rank, files in enumerate(ckpt_chunks):
+ if rank == dist.get_rank():
+ for file in files:
+ file = os.path.join(filepath, file)
+ states = load_file(file, device=device)
+ # print(f"rank {rank} loading {file}...")
+ shard_states.update(states)
+ else:
+ for file in files:
+ for param_name in safetensors2param[file]:
+ shard_states[param_name] = rank
+ return shard_states
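+
+# The returned dict maps each state name either to a loaded tensor (for chunks
+# owned by the current rank) or to the int rank that owns it; the latter is used
+# as the broadcast source when materializing states in parallel_init_module_fn.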
+
+
+def parallel_init_module_fn(module: torch.nn.Module, shard_states: Dict[str, torch.nn.Parameter]):
+ """
+ Generate a function to initialize sub-modules in the `module` with `shard_states`
+ from huggingface checkpoint.
+
+ Args:
+ module (torch.nn.Module): the global module to be initialized
+ shard_states (Dict[str, torch.nn.Parameter]): the shard states from huggingface checkpoint
+
+ Returns:
+ init_fn (Callable): a function to initialize sub-modules in the `module` with `shard_states`
+ """
+
+ state2fqn = {}
+ for name, state in itertools.chain(module.named_parameters(remove_duplicate=False),
+ module.named_buffers(remove_duplicate=False)):
+ state2fqn.setdefault(state, []).append(name)
+ # collect shared states, i.e. tensors registered under more than one name
+ shared = {s for s, names in state2fqn.items() if len(names) > 1}
+ materialized_states = {}
+
+ @torch.no_grad()
+ def create_and_sync_state(param_name, state, is_param):
+ assert param_name in shard_states, f"{param_name} not loaded"
+ device = torch.cuda.current_device()
+ if is_param:
+ param = torch.nn.Parameter(torch.empty_like(state.data, device=device), requires_grad=state.requires_grad)
+ else: # buffer
+ param = torch.empty_like(state.data, device=device)
+ loaded = shard_states[param_name]
+ if isinstance(loaded, (torch.nn.Parameter, torch.Tensor)):
+ # NOTE: loaded.dtype can be different from param.dtype
+ param.data.copy_(loaded.data)
+ dist.broadcast(param.data, src=dist.get_rank())
+ else:
+ assert isinstance(loaded, int) # the rank that holds the state
+ dist.broadcast(param.data, src=loaded)
+ shard_states.pop(param_name)
+ del loaded
+ return param
+
+ def init_fn(sub_mod: torch.nn.Module, recurse: bool = True):
+ param_and_buffers = tuple(sub_mod.named_parameters(recurse=False)) + tuple(sub_mod.named_buffers(recurse=False))
+ # param_and_buffers = sorted(sub_mod.named_parameters(recurse=False), key=lambda x: x[0])
+ for name, state in param_and_buffers:
+ if not state.is_meta:
+ continue
+ is_param = name in sub_mod._parameters
+ fqn = state2fqn[state].pop(0)
+ # non-persistent buffers will not be saved in state dict, we can safely skip it
+ if (not is_param) and fqn not in shard_states:
+ if state.is_meta:
+ raise RuntimeError(
+ f"found a non-persistent buffer ({fqn}) initialized on the meta device. "
+ "Such buffers are not saved in the checkpoint, so the user must ensure "
+ "they are initialized on a CPU/GPU device.")
+ continue
+ # for shared parameter, we get it from the first time it is created
+ if state in shared:
+ if state not in materialized_states:
+ materialized_states[state] = create_and_sync_state(fqn, state, is_param)
+ else:
+ if fqn in shard_states:
+ shard_states.pop(fqn)
+ materialize_state = materialized_states[state]
+ # for not shared parameter, we create it directly
+ else:
+ materialize_state = create_and_sync_state(fqn, state, is_param)
+ if is_param:
+ sub_mod._parameters[name] = materialize_state
+ else:
+ sub_mod._buffers[name] = materialize_state
+ if recurse:
+ for module in sub_mod.children():
+ init_fn(module, recurse=True)
+
+ # for debug
+ # if len(shard_states) == 0: print("clear")
+ return sub_mod
+
+ return init_fn
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c4ecb9a5956865ce35651d6eaaf6844ba87f41
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py
@@ -0,0 +1,144 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import logging
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN'))
+
+_HDFS_PREFIX = "hdfs://"
+
+_HDFS_BIN_PATH = shutil.which('hdfs')
+
+
+def exists(path: str, **kwargs) -> bool:
+ r"""Works like os.path.exists() but supports hdfs.
+
+ Test whether a path exists. Returns False for broken symbolic links.
+
+ Args:
+ path (str): path to test
+
+ Returns:
+ bool: True if the path exists, False otherwise
+ """
+ if _is_non_local(path):
+ return _exists(path, **kwargs)
+ return os.path.exists(path)
+
+
+def _exists(file_path: str):
+ """ hdfs capable to check whether a file_path is exists """
+ if file_path.startswith("hdfs"):
+ return _run_cmd(_hdfs_cmd(f"-test -e {file_path}")) == 0
+ return os.path.exists(file_path)
+
+
+def makedirs(name, mode=0o777, exist_ok=False, **kwargs) -> None:
+ r"""Works like os.makedirs() but supports hdfs.
+
+ Super-mkdir; create a leaf directory and all intermediate ones. Works like
+ mkdir, except that any intermediate path segment (not just the rightmost)
+ will be created if it does not exist. If the target directory already
+ exists, raise an OSError if exist_ok is False. Otherwise no exception is
+ raised. This is recursive.
+
+ Args:
+ name (str): directory to create
+ mode (int): file mode bits
+ exist_ok (bool): if True, do not raise an exception if the directory already exists
+ kwargs: keyword arguments for hdfs
+
+ """
+ if _is_non_local(name):
+ # TODO(haibin.lin):
+ # - handle OSError for hdfs(?)
+ # - support exist_ok for hdfs(?)
+ _mkdir(name, **kwargs)
+ else:
+ os.makedirs(name, mode=mode, exist_ok=exist_ok)
+
+
+def _mkdir(file_path: str) -> bool:
+ """hdfs mkdir"""
+ if file_path.startswith("hdfs"):
+ _run_cmd(_hdfs_cmd(f"-mkdir -p {file_path}"))
+ else:
+ os.makedirs(file_path, exist_ok=True)
+ return True
+
+
+def copy(src: str, dst: str, **kwargs) -> bool:
+ r"""Works like shutil.copy() for file, and shutil.copytree for dir, and supports hdfs.
+
+ Copy data and mode bits ("cp src dst"). Return the file's destination.
+ The destination may be a directory.
+ If source and destination are the same file, a SameFileError will be
+ raised.
+
+ Args:
+ src (str): source file path
+ dst (str): destination file path
+ kwargs: keyword arguments for hdfs copy
+
+ Returns:
+ bool for HDFS copies (success flag), or the destination path for local copies
+
+ """
+ if _is_non_local(src) or _is_non_local(dst):
+ # TODO(haibin.lin):
+ # - handle SameFileError for hdfs files(?)
+ # - return file destination for hdfs files
+ return _copy(src, dst)
+ else:
+ if os.path.isdir(src):
+ return shutil.copytree(src, dst, **kwargs)
+ else:
+ return shutil.copy(src, dst, **kwargs)
+
+
+def _copy(from_path: str, to_path: str, timeout: int = None) -> bool:
+ if to_path.startswith("hdfs"):
+ if from_path.startswith("hdfs"):
+ returncode = _run_cmd(_hdfs_cmd(f"-cp -f {from_path} {to_path}"), timeout=timeout)
+ else:
+ returncode = _run_cmd(_hdfs_cmd(f"-put -f {from_path} {to_path}"), timeout=timeout)
+ else:
+ if from_path.startswith("hdfs"):
+ returncode = _run_cmd(_hdfs_cmd(f"-get \
+ {from_path} {to_path}"), timeout=timeout)
+ else:
+ try:
+ shutil.copy(from_path, to_path)
+ returncode = 0
+ except shutil.SameFileError:
+ returncode = 0
+ except Exception as e:
+ logger.warning(f"copy {from_path} {to_path} failed: {e}")
+ returncode = -1
+ return returncode == 0
+
+
+def _run_cmd(cmd: str, timeout=None):
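+ # NOTE: the timeout argument is currently ignored; os.system provides no timeout support.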
+ return os.system(cmd)
+
+
+def _hdfs_cmd(cmd: str) -> str:
+ return f"{_HDFS_BIN_PATH} dfs {cmd}"
+
+
+def _is_non_local(path: str):
+ return path.startswith(_HDFS_PREFIX)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/import_utils.py b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5690512d144a30d2a1f0bd128a40eb8876936b7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py
@@ -0,0 +1,48 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities to check if packages are available.
+We assume package availability won't change during runtime.
+"""
+
+from functools import cache
+from typing import List
+
+
+@cache
+def is_megatron_core_available():
+ try:
+ from megatron.core import parallel_state as mpu
+ return True
+ except ImportError:
+ return False
+
+
+@cache
+def is_vllm_available():
+ try:
+ import vllm
+ return True
+ except ImportError:
+ return False
+
+
+def import_external_libs(external_libs=None):
+ if external_libs is None:
+ return
+ if not isinstance(external_libs, List):
+ external_libs = [external_libs]
+ import importlib
+ for external_lib in external_libs:
+ importlib.import_module(external_lib)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf6e1f0fa70784edb6a7e6efecdba07f0c399b3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py
@@ -0,0 +1,22 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+
+def set_basic_config(level):
+ """
+ This function sets the global logging format and level. It will be called when import verl
+ """
+ logging.basicConfig(format='%(levelname)s:%(asctime)s:%(message)s', level=level)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcb6b65a79ea302e3f7eaccd5145e29adbb9edd6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py
@@ -0,0 +1,253 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pretrain utilities."""
+from typing import Any, Dict
+import time
+from omegaconf import DictConfig
+from verl.utils.torch_dtypes import PrecisionType
+from verl.utils.memory_buffer import build_memory_reference_from_module
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from megatron.core import mpu, tensor_parallel
+from megatron.core.utils import get_model_config
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.module import Float16Module
+# from megatron.core.distributed import DistributedDataParallelConfig
+from megatron.core.distributed import DistributedDataParallel as DDP
+from megatron.core.enums import ModelType
+
+
+def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True):
+ """Build the model."""
+ # Build model.
+ if mpu.get_pipeline_model_parallel_world_size() > 1 and \
+ mpu.get_virtual_pipeline_model_parallel_world_size() is not None:
+ assert model_type != ModelType.encoder_and_decoder, \
+ "Interleaved schedule not supported for model with both encoder and decoder"
+ model = []
+ for i in range(mpu.get_virtual_pipeline_model_parallel_world_size()):
+ mpu.set_virtual_pipeline_model_parallel_rank(i)
+ # Set pre_process and post_process only after virtual rank is set.
+ pre_process = mpu.is_pipeline_first_stage()
+ post_process = mpu.is_pipeline_last_stage()
+ this_model = model_provider_func(pre_process=pre_process, post_process=post_process)
+ this_model.model_type = model_type
+ model.append(this_model)
+ else:
+ pre_process = mpu.is_pipeline_first_stage()
+ post_process = mpu.is_pipeline_last_stage()
+ add_encoder = True
+ add_decoder = True
+ if model_type == ModelType.encoder_and_decoder:
+ if mpu.get_pipeline_model_parallel_world_size() > 1:
+ assert mpu.get_pipeline_model_parallel_split_rank() is not None, \
+ "Split rank needs to be specified for model with both encoder and decoder"
+ rank = mpu.get_pipeline_model_parallel_rank()
+ split_rank = mpu.get_pipeline_model_parallel_split_rank()
+ world_size = mpu.get_pipeline_model_parallel_world_size()
+ pre_process = rank == 0 or rank == split_rank
+ post_process = (rank == (split_rank - 1)) or (rank == (world_size - 1))
+ add_encoder = mpu.is_pipeline_stage_before_split()
+ add_decoder = mpu.is_pipeline_stage_after_split()
+ model = model_provider_func(pre_process=pre_process,
+ post_process=post_process,
+ add_encoder=add_encoder,
+ add_decoder=add_decoder)
+ else:
+ model = model_provider_func(pre_process=pre_process, post_process=post_process)
+ model.model_type = model_type
+
+ if not isinstance(model, list):
+ model = [model]
+
+ # Set tensor model parallel attributes if not set.
+ # Only parameters that are already tensor model parallel have these
+ # attributes set for them. We should make sure the default attributes
+ # are set for all params so the optimizer can use them.
+ for model_module in model:
+ for param in model_module.parameters():
+ tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
+
+ # Print number of parameters.
+ if mpu.get_data_parallel_rank() == 0:
+ print(' > number of parameters on (tensor, pipeline) '
+ 'model parallel rank ({}, {}): {}'.format(
+ mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(),
+ sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model])),
+ flush=True)
+
+ # GPU allocation.
+ for model_module in model:
+ model_module.cuda(torch.cuda.current_device())
+
+ # Fp16 conversion.
+ config = get_model_config(model[0])
+ if config.fp16 or config.bf16: # the ModelParallelConfig in GPTModel
+ model = [Float16Module(config, model_module) for model_module in model]
+
+ if wrap_with_ddp:
+ model = [
+ DDP(config=config,
+ module=model_chunk,
+ data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True),
+ accumulate_allreduce_grads_in_fp32=True,
+ overlap_grad_reduce=False,
+ use_distributed_optimizer=True,
+ disable_bucketing=(model_chunk_idx > 0)) for (model_chunk_idx, model_chunk) in enumerate(model)
+ ]
+ # # Broadcast params from data parallel src rank to other data parallel ranks.
+ # if args.data_parallel_random_init:
+ for model_module in model:
+ model_module.broadcast_params()
+ return model
+
+
+ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module)
+
+
+def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES):
+ return_list = True
+ if not isinstance(model, list):
+ model = [model]
+ return_list = False
+ unwrapped_model = []
+ for model_module in model:
+ while isinstance(model_module, module_instances):
+ model_module = model_module.module
+ unwrapped_model.append(model_module)
+ if not return_list:
+ return unwrapped_model[0]
+ return unwrapped_model
+
+
+from transformers import PretrainedConfig
+
+
+def convert_config(hf_config: PretrainedConfig, megatron_config) -> TransformerConfig:
+ print(f'megatron config {megatron_config}')
+ dt = PrecisionType.to_dtype(megatron_config['param_dtype'])
+ print(f'pipeline_dtype={dt}')
+ transformer_config = TransformerConfig(
+ num_layers=hf_config.num_hidden_layers,
+ hidden_size=hf_config.hidden_size,
+ num_attention_heads=hf_config.num_attention_heads,
+ num_query_groups=hf_config.num_key_value_heads,
+ ffn_hidden_size=hf_config.intermediate_size,
+ # max_position_embeddings=hf_config.max_position_embeddings,
+ activation_func=F.silu,
+ normalization='RMSNorm',
+ # rotary_percent=False, # default,
+ gated_linear_unit=True, # for llama
+ use_cpu_initialization=True,
+ apply_residual_connection_post_layernorm=False, # TODO: verify what this controls
+ add_bias_linear=False,
+ tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(),
+ pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(),
+ virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(),
+ pipeline_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']),
+ params_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']),
+ sequence_parallel=megatron_config['sequence_parallel_enabled'],
+ variable_seq_lengths=True,
+ masked_softmax_fusion=True,
+ bf16=PrecisionType.to_dtype(megatron_config['param_dtype']) is torch.bfloat16)
+ if torch.distributed.get_rank() == 0:
+ print(f'tensor_parallel_size={transformer_config.tensor_model_parallel_size} \n \
+ pipeline_model_parallel_size={transformer_config.pipeline_model_parallel_size} \n \
+ virtual_pipeline_model_parallel_size={transformer_config.virtual_pipeline_model_parallel_size} \n \
+ pipeline_dtype={transformer_config.pipeline_dtype} \n \
+ params_dtype={transformer_config.params_dtype} \n \
+ sequence_parallel={transformer_config.sequence_parallel} \n \
+ variable_seq_lengths={transformer_config.variable_seq_lengths} \n \
+ masked_softmax_fusion={transformer_config.masked_softmax_fusion} \n ')
+
+ return transformer_config
+
+
+# from megatron.core.optimizer import OptimizerConfig
+
+from verl.utils.megatron.optimizer_config import OptimizerConfig
+
+
+def init_megatron_optim_config(optim_config: Dict) -> OptimizerConfig:
+ config = OptimizerConfig(
+ optimizer='adam',
+ lr=optim_config.get('lr'),
+ clip_grad=optim_config.get('clip_grad'),
+ weight_decay=1e-2,
+ bf16=True,
+ params_dtype=torch.bfloat16,
+ use_distributed_optimizer=True,
+ )
+ return config
+
+
+from megatron.core import ModelParallelConfig
+
+
+def init_model_parallel_config(config: DictConfig) -> ModelParallelConfig:
+ # TODO(sgm): check how to disable megatron timers
+ timers = FakeTimers() # FakeTimers is defined below; the name resolves at call time
+ return ModelParallelConfig(tensor_model_parallel_size=config.get('tensor_model_parallel_size'),
+ pipeline_model_parallel_size=config.get('pipeline_model_parallel_size'),
+ virtual_pipeline_model_parallel_size=config.get('virtual_pipeline_model_parallel_size'),
+ sequence_parallel=config.get('sequence_parallel'),
+ params_dtype=PrecisionType.to_dtype(config.get('param_dtype')),
+ pipeline_dtype=PrecisionType.to_dtype(config.get('param_dtype')),
+ bf16=True,
+ fp16=False,
+ timers=timers)
+
+
+class FakeTimers:
+ """Disable All Megatron Timing with FakeTimers"""
+
+ def __init__(self):
+ from megatron.timers import DummyTimer
+ self.dummy_timer = DummyTimer()
+
+ def __call__(self, *args: Any, **kwds: Any) -> Any:
+ return self.dummy_timer
+
+
+def offload_megatron_param_and_grad(module_list: nn.ModuleList, offload_grad=False, hybrid_engine=None):
+ if hybrid_engine is not None:
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ for buffer in hybrid_engine.memory_buffers[pp_rank].values():
+ buffer.data = buffer.data.to('cpu', non_blocking=True)
+ build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True)
+ else:
+ for module in module_list:
+ for _, param in module.named_parameters():
+ param.data = param.data.to('cpu', non_blocking=True)
+ if offload_grad and param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_megatron_param_and_grad(module_list: nn.ModuleList, device_id, load_grad=False, hybrid_engine=None):
+ if hybrid_engine is not None:
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ for buffer in hybrid_engine.memory_buffers[pp_rank].values():
+ buffer.data = buffer.data.to(device_id, non_blocking=True)
+ build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True)
+ else:
+ for module in module_list:
+ for _, param in module.named_parameters():
+ param.data = param.data.to(device_id, non_blocking=True)
+ if load_grad and param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
diff --git a/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e07e42f7bc4648d3376dba404ae122e07ccb0d0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py
@@ -0,0 +1,214 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains utilities to manipulate torch memory buffers
+"""
+
+from typing import Dict, List
+
+import torch
+from torch import nn
+
+
+class MemoryBuffer:
+ """
+ A memory buffer is a contiguous torch tensor that can back multiple tensors
+ sharing the same underlying storage. All tensors in a buffer must share a single
+ dtype to support this behavior.
+ """
+
+ def __init__(self, numel: int, numel_padded: int, dtype: torch.dtype):
+ self.numel = numel
+ self.numel_padded = numel_padded
+ self.dtype = dtype
+ self.data = torch.zeros(self.numel_padded, dtype=self.dtype, device='cuda', requires_grad=False)
+
+ def zero(self):
+ """Reset the buffer to zero."""
+ self.data.zero_()
+
+ def get(self, shape, start_index):
+ """Return a tensor with the input `shape` as a view into the
+ 1-D data starting at `start_index`."""
+ end_index = start_index + shape.numel()
+ assert end_index <= self.numel, \
+ 'requested tensor is out of the buffer range.'
+ buffer_tensor = self.data[start_index:end_index]
+ buffer_tensor = buffer_tensor.view(shape)
+ return buffer_tensor
+
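+# Illustrative usage (not part of the original file; assumes a CUDA device,
+# since MemoryBuffer allocates on the current GPU):
+#
+#     buf = MemoryBuffer(numel=8, numel_padded=8, dtype=torch.float32)
+#     t = buf.get(torch.Size([2, 2]), start_index=0)
+#     t.fill_(1.0)  # writes through to buf.data[0:4]
+#     assert buf.data[:4].sum() == 4.0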
+
+def calc_padded_numel(shape: torch.Size, dtype: torch.dtype):
+ """for cuda memory alignment, make sure alignment by 128-bits"""
+ align_numel = 128 // torch.finfo(dtype).bits
+ numel = shape.numel()
+ return (numel + align_numel - 1) // align_numel * align_numel
+
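+# Example (illustrative): bf16 is 16 bits wide, so align_numel = 128 // 16 = 8,
+# and a (3, 5) tensor with 15 elements is padded up to 16.
+#
+#     >>> calc_padded_numel(torch.Size([3, 5]), torch.bfloat16)
+#     16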
+
+def get_weight_buffer_meta_from_module(module: nn.Module) -> Dict[str, Dict]:
+ """
+ Return a dictionary mapping each parameter name to its shape and dtype.
+ """
+ weight_buffer_meta = {}
+ for name, param in sorted(module.named_parameters()):
+ weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype}
+ return weight_buffer_meta
+
+
+def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
+ """Build the memory buffer given weight_buffer_meta
+
+ Args:
+ weight_buffer_meta: contains mapping from name to a dictionary containing shape and dtype of the tensors
+
+ Returns: a large memory buffer for each dtype that can hold all the tensors
+
+ """
+ memory_buffers = {}
+ total_numel_map = {} # map from dtype to the total numel
+ for name, meta_info in sorted(weight_buffer_meta.items()):
+ shape = meta_info['shape']
+ dtype = meta_info['dtype']
+
+ assert isinstance(shape, torch.Size)
+ assert isinstance(dtype, torch.dtype)
+
+ if dtype not in total_numel_map:
+ total_numel_map[dtype] = 0
+
+ total_numel_map[dtype] += calc_padded_numel(shape, dtype)
+
+ for dtype, total_numel in total_numel_map.items():
+ memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype)
+
+ return memory_buffers
+
+
+def build_memory_reference_from_module(module: torch.nn.Module,
+ memory_buffers: Dict[torch.dtype, MemoryBuffer],
+ maintain_weight=True):
+ start_index = {}
+ for dtype in memory_buffers.keys():
+ start_index[dtype] = 0
+ for name, param in sorted(module.named_parameters()):
+ memory_buffer = memory_buffers[param.dtype]
+ buffer = memory_buffer.get(shape=param.shape, start_index=start_index[param.dtype])
+ # increment start_index by the padded numel of this param's dtype
+ # (the original code used the stale `dtype` loop variable here, which is
+ # wrong for modules that mix dtypes)
+ start_index[param.dtype] += calc_padded_numel(param.shape, param.dtype)
+ if maintain_weight:
+ buffer.copy_(param.data)
+ param.data = buffer
+
+
+def build_memory_reference(weight_buffer_meta: Dict[str, Dict], memory_buffers: Dict[torch.dtype, MemoryBuffer]):
+ """Build the memory references. The memory buffers are built using the build_memory_buffer API.
+ This API will allocate a weight buffer pointer to the memory buffer according to the weight_buffer_meta.
+
+ Args:
+ weight_buffer_meta:
+ memory_buffers:
+
+ Returns:
+
+ """
+ start_idx = {}
+ weight_buffers = {}
+ for dtype in memory_buffers.keys():
+ start_idx[dtype] = 0
+
+ for name, meta_info in sorted(weight_buffer_meta.items()):
+ shape = meta_info['shape']
+ dtype = meta_info['dtype']
+
+ buffer = memory_buffers[dtype].get(shape, start_index=start_idx[dtype])
+ start_idx[dtype] += calc_padded_numel(shape, dtype)
+ weight_buffers[name] = buffer
+
+ return weight_buffers
+
+
+class MemoryBufferModuleWrapper:
+ """
+ Note that we do not design MemoryBufferModuleWrapper as an nn.Module due to
+ - It will change the checkpoint name
+ """
+
+ def __init__(self, module: nn.Module):
+ super().__init__()
+ self.module = module
+ self.weight_buffer_meta = get_weight_buffer_meta_from_module(self.module)
+ self.memory_buffers = build_memory_buffer(self.weight_buffer_meta)
+ build_memory_reference_from_module(self.module, self.memory_buffers)
+
+ def get_memory_buffers(self):
+ return self.memory_buffers
+
+ def get_weight_buffer_meta(self):
+ return self.weight_buffer_meta
+
+
+class MegatronMemoryBufferForRollout(object):
+ """
+ We assume that
+ - inference engine has tp + dp
+ - actor has tp + pp + dp
+ - the tp between inference engine and actor should be the same
+ - memory_buffers: contains a list of memory_buffers, each is a dict from dtype to MemoryBuffer
+ - weight_buffers: contains a list of weight_buffers, each is a dict from name to param
+ - named_parameters: a dict from name to parameter that normalizes the names across pp and vpp. Note that
+ the named_parameters may not be directly compatible with the inference engine; the user has to handle
+ layout mismatches (e.g. qkv transpose).
+ - Note that weight_buffers, named_parameters and memory_buffers share the same underlying GPU memory.
+ - When doing weight sync, the data is transferred via the memory buffers.
+ """
+
+ def __init__(self, transform_memory_param_fn):
+ self._memory_buffers = []
+ self._weight_buffers = []
+ self._named_parameters = {}
+ self.transform_memory_param_fn = transform_memory_param_fn
+
+ def initialize_weight_buffer(self, weight_buffer_meta_pp: List[Dict[str, Dict]]):
+ """
+ Initialize the weight buffer. The weight buffer is obtained according to the actor. We will construct
+ a large buffer for each dtype in the weight_buffer.
+
+ Args:
+ weight_buffer_meta_pp: one entry per pp stage, each a dictionary mapping parameter name to its shape and dtype
+
+ Returns: None
+
+ """
+ self.weight_buffer_meta_pp = weight_buffer_meta_pp
+
+ for weight_buffer_meta in self.weight_buffer_meta_pp:
+ memory_buffer = build_memory_buffer(weight_buffer_meta)
+ self._memory_buffers.append(memory_buffer)
+ self._weight_buffers.append(None)
+
+ def build_memory_reference(self):
+ for i, weight_buffer_meta in enumerate(self.weight_buffer_meta_pp):
+ self._weight_buffers[i] = build_memory_reference(weight_buffer_meta, self._memory_buffers[i])
+ self._named_parameters = self.transform_memory_param_fn(self._weight_buffers)
+
+ @property
+ def named_parameters(self):
+ return self._named_parameters
+
+ @property
+ def weight_buffers(self):
+ return self._weight_buffers
+
+ @property
+ def memory_buffers(self):
+ return self._memory_buffers
diff --git a/code/RL_model/verl/Search-R1/verl/utils/model.py b/code/RL_model/verl/Search-R1/verl/utils/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9002451a1dce34b8c844f907ee6ac487351b5314
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/model.py
@@ -0,0 +1,332 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities to create common models from huggingface
+"""
+import os
+import warnings
+from typing import Dict, Type
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, MistralForSequenceClassification
+from verl.models.registry import ModelRegistry
+
+
+class LambdaLayer(nn.Module):
+
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+
+ def forward(self, *args, **kwargs):
+ return self.fn(*args, **kwargs)
+
+
+def squeeze(x):
+ return torch.squeeze(x, dim=-1)
+
+
+def update_model_config(module_config, override_config_kwargs):
+ for key, val in override_config_kwargs.items():
+ setattr(module_config, key, val)
+
+
+def get_huggingface_actor_config(model_name: str, override_config_kwargs=None, trust_remote_code=False) -> Dict:
+ if override_config_kwargs is None:
+ override_config_kwargs = {}
+ assert isinstance(override_config_kwargs, Dict), \
+ f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}'
+ module_config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+ update_model_config(module_config, override_config_kwargs)
+
+ return module_config
+
+
+def create_huggingface_actor(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module:
+ """
+
+ Args:
+ model_name:
+ actor_override_config_kwargs:
+
+ Returns:
+
+ """
+ if override_config_kwargs is None:
+ override_config_kwargs = {}
+ if automodel_kwargs is None:
+ automodel_kwargs = {}
+ assert isinstance(override_config_kwargs, Dict), \
+ f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}'
+ module_config = get_huggingface_actor_config(model_name,
+ override_config_kwargs,
+ trust_remote_code=automodel_kwargs.get('trust_remote_code', False))
+ module: nn.Module = AutoModelForCausalLM.from_config(module_config, **automodel_kwargs)
+ return module
+
+
+def create_huggingface_critic(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module:
+ """
+
+ Args:
+ model_name:
+ override_config_kwargs:
+
+ Returns:
+
+ """
+ critic_module: nn.Module = create_huggingface_actor(model_name,
+ override_config_kwargs=override_config_kwargs,
+ automodel_kwargs=automodel_kwargs)
+ if automodel_kwargs is None:
+ automodel_kwargs = {}
+ torch_dtype = automodel_kwargs.get('torch_dtype', torch.float32)
+ critic_module.lm_head = nn.Sequential(nn.Linear(critic_module.config.hidden_size, 1, dtype=torch_dtype),
+ LambdaLayer(fn=squeeze))
+ return critic_module
+
+
+def get_model_size(model: nn.Module, scale='auto'):
+ n_params = sum(p.numel() for p in model.parameters())
+
+ if scale == 'auto':
+ if n_params > 1e9:
+ scale = 'B'
+ elif n_params > 1e6:
+ scale = 'M'
+ elif n_params > 1e3:
+ scale = 'K'
+ else:
+ scale = ''
+
+ if scale == 'B':
+ n_params = n_params / 1e9
+ elif scale == 'M':
+ n_params = n_params / 1e6
+ elif scale == 'K':
+ n_params = n_params / 1e3
+ elif scale == '':
+ pass
+ else:
+ raise NotImplementedError(f'Unknown scale {scale}')
+
+ return n_params, scale
+
+
+def print_model_size(model: nn.Module, name: str = None):
+ n_params, scale = get_model_size(model, scale='auto')
+ if name is None:
+ name = model.__class__.__name__
+ print(f'{name} contains {n_params:.2f}{scale} parameters')
+
+
+def create_random_mask(input_ids: torch.Tensor,
+ max_ratio_of_valid_token: float,
+ max_ratio_of_left_padding: float,
+ min_ratio_of_valid_token: float = 0):
+ """Create a random mask given input_ids. Support left padding and right padding.
+ Process:
+ - Sample valid token length
+ - Sample left_padding length
+ - Generate padding
+
+ Args:
+ input_ids:
+ shape (batch_size, seq_len)
+
+ Returns:
+
+ """
+ assert max_ratio_of_valid_token > 0 and max_ratio_of_valid_token <= 1.
+ assert max_ratio_of_left_padding >= 0 and max_ratio_of_left_padding < 1.
+ assert min_ratio_of_valid_token <= max_ratio_of_valid_token
+
+ batch_size, sequence_length = input_ids.shape
+ max_num_valid_tokens = int(sequence_length * max_ratio_of_valid_token)
+ min_num_valid_tokens = max(1, int(sequence_length * min_ratio_of_valid_token))
+ max_left_padding = int(sequence_length * max_ratio_of_left_padding)
+ assert max_num_valid_tokens + max_left_padding <= sequence_length
+ assert 0 < max_num_valid_tokens <= sequence_length
+ masks = torch.ones_like(input_ids, dtype=torch.int64)
+ # TODO: we can make this faster
+ for i in range(batch_size):
+ num_left_padding = np.random.randint(low=0, high=max_left_padding + 1, dtype=np.int64)
+ num_valid = np.random.randint(low=min_num_valid_tokens, high=max_num_valid_tokens + 1, dtype=np.int64)
+
+ for index in range(num_left_padding):
+ masks[i, index] = 0
+
+ for index in range(num_left_padding + num_valid, sequence_length):
+ masks[i, index] = 0
+ return masks
+
+
+def compute_position_id_with_mask(mask):
+ return torch.clip(torch.cumsum(mask, dim=-1) - 1, min=0, max=None)
+
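+# Example (illustrative): with left padding, pad positions are clipped to 0 and
+# valid tokens get consecutive position ids starting at 0.
+#
+#     >>> compute_position_id_with_mask(torch.tensor([[0, 0, 1, 1, 1]]))
+#     tensor([[0, 0, 0, 1, 2]])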
+
+def normalize_pp_vpp_params(params, num_hidden_layers, layer_name='layers'):
+ """
+ Normalize the pp vpp params into a complete named parameters.
+ This is useful when gather parameters from pp ranks and passed to a model without pp
+
+ params: List[List[Dict[str, param]]]
+ params contains a list of pp, with a list of vpp named_parameters in each vpp chunk.
+ output: Dict[str, param]
+
+ """
+
+ def normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_layers):
+ """
+ Transform the model name in each model_chunk in each pp stage into the name in inference engine
+ """
+ if vpp_size > 1:
+ # print(f'try to bind vpp params to inference engine...')
+ layers_per_pp = num_layers // pp_size
+ layers_per_vpp = layers_per_pp // vpp_size
+ pp_offset = layers_per_vpp * pp_rank
+ vpp_offset = (layers_per_vpp * pp_size) * vpp_rank
+ layer_offset = pp_offset + vpp_offset
+ else:
+ layers_per_pp = num_layers // pp_size
+ layer_offset = layers_per_pp * pp_rank
+
+ if layer_name in name: # belong to an intermediate layer
+ split_name = name.split('.')
+ # find the num next to split_name
+ for i, segment in enumerate(split_name):
+ if segment == layer_name:
+ break
+ layer_num_idx = i + 1
+ # check the name
+ assert len(split_name) >= layer_num_idx + 1, f'split_name = {split_name}'
+ assert split_name[layer_num_idx].isdigit(), f'split_name = {split_name}'
+ # increment layer_num_idx by layer_offset
+ split_name[layer_num_idx] = str(int(split_name[layer_num_idx]) + layer_offset)
+ name = '.'.join(split_name) # weight name in inference_tp_model
+ return name
+
+ pp_size = len(params)
+ normalized_name_to_param = {}
+ for pp_rank in range(len(params)):
+ vpp_size = len(params[pp_rank])
+ for vpp_rank in range(vpp_size):
+ for name, param in params[pp_rank][vpp_rank].items():
+ normalized_name = normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_hidden_layers)
+ normalized_name_to_param[normalized_name] = param
+
+ return normalized_name_to_param
+
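+# Example (illustrative): with num_hidden_layers=2 and pp_size=2 (no vpp), pp
+# rank 1 owns global layer 1, so its local 'model.layers.0.*' name is shifted:
+#
+#     >>> t = torch.zeros(1)
+#     >>> params = [[{'model.layers.0.w': t}], [{'model.layers.0.w': t}]]
+#     >>> sorted(normalize_pp_vpp_params(params, num_hidden_layers=2))
+#     ['model.layers.0.w', 'model.layers.1.w']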
+
+def get_parallel_model_from_config(config, megatron_config, pre_process=None, post_process=None, value=False):
+ from megatron.core import ModelParallelConfig
+ assert isinstance(megatron_config, ModelParallelConfig)
+ model_class = _get_parallel_model_architecture_from_config(config, value)
+
+ model = model_class(config, megatron_config, pre_process=pre_process, post_process=post_process)
+ return model
+
+
+def _get_parallel_model_architecture_from_config(config: PretrainedConfig, value=False) -> Type[nn.Module]:
+ architectures = getattr(config, "architectures", [])
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch, value)
+ if model_cls is not None:
+ return model_cls
+ raise ValueError(f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def load_megatron_model_weights(config,
+ model_config,
+ parallel_model,
+ params_dtype,
+ is_value_model=False,
+ local_cache_path='~/.cache/verl/rlhf'):
+ assert hasattr(model_config, "architectures"), "architectures cannot be empty when load weight!"
+ architectures = getattr(model_config, "architectures", [])
+ local_cache_path = os.path.expanduser(local_cache_path)
+
+ if config.model.path.startswith("hdfs:"):
+ from verl.utils.fs import copy_local_path_from_hdfs
+ print(f'start download from {config.model.path}')
+ local_model_path = copy_local_path_from_hdfs(src=config.model.path, cache_dir=local_cache_path)
+ print('finish download')
+ else:
+ print(f"load from local dir {config.model.path}")
+ local_model_path = config.model.path
+
+ # TODO: to find a better way to load mistral7b-rm lm_head
+ if 'mistral7b-rm' in config.model.path:
+ model = MistralForSequenceClassification.from_pretrained(local_model_path) # use score head instead of lm_head
+ state_dict = model.state_dict()
+ state_dict['lm_head.weight'] = state_dict['score.weight']
+ state_dict['model.embed_tokens.weight'] = state_dict[
+ 'model.embed_tokens.weight'][:32000] # workaround, 32001 -> 32000
+ is_value_model = True
+ else:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ model = AutoModelForCausalLM.from_pretrained(local_model_path)
+ state_dict = model.state_dict()
+
+ from verl.models.weight_loader_registry import get_weight_loader
+ print(f'before weight loader: architectures = {architectures}...')
+ for arch in architectures:
+ print(f'call weight loader arch = {arch}, model config = {model.config}')
+ weight_loader = get_weight_loader(arch)
+ weight_loader(state_dict=state_dict,
+ wrapped_models=parallel_model,
+ config=model.config,
+ params_dtype=params_dtype,
+ is_value_model=is_value_model)
+
+
+# pad input_ids_rmpad, cu_seqlens and max_seqlen_in_batch to be divisible by tp
+def pad_packed_inputs(unpad_tokens: torch.Tensor, cu_seqlens, max_seqlen_in_batch, size):
+ """pad the tokens such that the total length is a multiple of size.
+ This function is useful when applying sequence parallel and context parallel
+
+ Args:
+ unpad_tokens: (total_nnz, ...). Tokens after removing padding
+ cu_seqlens: (total_nnz + 1,)
+ max_seqlen_in_batch: int
+
+ Returns:
+
+ """
+ F = nn.functional
+
+ total_nnz = unpad_tokens.shape[0]
+
+ if total_nnz % size == 0:
+ pad_size = 0
+ else:
+ pad_size = size - total_nnz % size
+
+ # we assume adding a new data in the batch with seqlen pad_size
+ if pad_size > 0:
+ if unpad_tokens.ndim == 1:
+ unpad_tokens = F.pad(unpad_tokens, (0, pad_size))
+ elif unpad_tokens.ndim == 2:
+ unpad_tokens = F.pad(unpad_tokens, (0, 0, 0, pad_size))
+ else:
+ raise NotImplementedError(f'Padding dim {unpad_tokens.ndim} is not supported')
+
+ cu_seqlens = F.pad(cu_seqlens, (0, 1), value=pad_size + cu_seqlens[-1])
+ max_seqlen_in_batch = max(max_seqlen_in_batch, pad_size)
+
+ return unpad_tokens, cu_seqlens, max_seqlen_in_batch
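+
+
+# Example (illustrative): with total_nnz=10 and size=4, pad_size is 2, so the
+# tokens grow to 12 entries, cu_seqlens gains a final boundary at 12 and
+# max_seqlen_in_batch stays max(6, 2) = 6.
+#
+#     >>> out, cu, m = pad_packed_inputs(torch.arange(10), torch.tensor([0, 4, 10]), 6, size=4)
+#     >>> out.shape[0], cu.tolist(), m
+#     (12, [0, 4, 10, 12], 6)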
diff --git a/code/RL_model/verl/Search-R1/verl/utils/py_functional.py b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5a0e176779cc19d3035a3af77a1bdf1f39349a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py
@@ -0,0 +1,56 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contain small python utility functions
+"""
+
+from typing import Dict
+from types import SimpleNamespace
+
+
+def union_two_dict(dict1: Dict, dict2: Dict):
+ """Union two dict. Will throw an error if there is an item not the same object with the same key.
+
+ Args:
+ dict1:
+ dict2:
+
+ Returns:
+
+ """
+ for key, val in dict2.items():
+ if key in dict1:
+ assert dict2[key] == dict1[key], \
+ f'value of {key} in dict1 and dict2 are not the same'
+ dict1[key] = val
+
+ return dict1
+
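+# Example (illustrative): overlapping keys must agree; new keys are merged in.
+#
+#     >>> union_two_dict({'lr': 1e-6}, {'lr': 1e-6, 'epochs': 1})
+#     {'lr': 1e-06, 'epochs': 1}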
+
+def append_to_dict(data: Dict, new_data: Dict):
+ for key, val in new_data.items():
+ if key not in data:
+ data[key] = []
+ data[key].append(val)
+
+
+class NestedNamespace(SimpleNamespace):
+
+ def __init__(self, dictionary, **kwargs):
+ super().__init__(**kwargs)
+ for key, value in dictionary.items():
+ if isinstance(value, dict):
+ self.__setattr__(key, NestedNamespace(value))
+ else:
+ self.__setattr__(key, value)
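+
+
+# Example (illustrative): nested dicts become attribute access.
+#
+#     >>> cfg = NestedNamespace({'optim': {'lr': 1e-6}, 'name': 'actor'})
+#     >>> cfg.optim.lr, cfg.name
+#     (1e-06, 'actor')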
diff --git a/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a75df6c37bc5a295aaa192b2a56cca2423e94b9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py
@@ -0,0 +1,43 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contains commonly used utilities for ray
+"""
+
+import ray
+
+import concurrent.futures
+
+
+def parallel_put(data_list, max_workers=None):
+
+ def put_data(index, data):
+ return index, ray.put(data)
+
+ if max_workers is None:
+ max_workers = min(len(data_list), 16)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+ data_list_f = [executor.submit(put_data, i, data) for i, data in enumerate(data_list)]
+ res_lst = []
+ for future in concurrent.futures.as_completed(data_list_f):
+ res_lst.append(future.result())
+
+ # reorder based on index
+ output = [None for _ in range(len(data_list))]
+ for res in res_lst:
+ index, data_ref = res
+ output[index] = data_ref
+
+ return output
diff --git a/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fee45da0d33264ea40591f95a98bdf35ef0ea4ad
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py
@@ -0,0 +1,265 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Tuple, Callable
+import heapq
+
+import torch
+from torch import distributed as dist
+
+from tensordict import TensorDict
+import copy
+
+
+def karmarkar_karp(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ # see: https://en.wikipedia.org/wiki/Largest_differencing_method
+ class Set:
+
+ def __init__(self) -> None:
+ self.sum = 0
+ self.items = []
+
+ def add(self, idx: int, val: int):
+ self.items.append((idx, val))
+ self.sum += val
+
+ def merge(self, other):
+ for idx, val in other.items:
+ self.items.append((idx, val))
+ self.sum += val
+
+ def __lt__(self, other):
+ if self.sum != other.sum:
+ return self.sum < other.sum
+ if len(self.items) != len(other.items):
+ return len(self.items) < len(other.items)
+ return self.items < other.items
+
+ class State:
+
+ def __init__(self, items: List[Tuple[int, int]], k: int) -> None:
+ self.k = k
+ # sets are kept sorted in decreasing order of sum
+ self.sets = [Set() for _ in range(k)]
+ assert len(items) in [1, k], f"{len(items)} not in [1, {k}]"
+ for i, (idx, seqlen) in enumerate(items):
+ self.sets[i].add(idx=idx, val=seqlen)
+ self.sets = sorted(self.sets, reverse=True)
+
+ def get_partitions(self):
+ partitions = []
+ for i in range(len(self.sets)):
+ cur_partition = []
+ for idx, _ in self.sets[i].items:
+ cur_partition.append(idx)
+ partitions.append(cur_partition)
+ return partitions
+
+ def merge(self, other):
+ for i in range(self.k):
+ self.sets[i].merge(other.sets[self.k - 1 - i])
+ self.sets = sorted(self.sets, reverse=True)
+
+ @property
+ def spread(self) -> int:
+ return self.sets[0].sum - self.sets[-1].sum
+
+ def __lt__(self, other):
+ # heapq is a min-heap, so invert the comparison: the state with the largest
+ # spread is popped first; on ties, the state with the largest set wins.
+ if self.spread != other.spread:
+ return self.spread > other.spread
+ return self.sets[0] > other.sets[0]
+
+ def __repr__(self) -> str:
+ repr_str = "["
+ for i in range(self.k):
+ if i > 0:
+ repr_str += ","
+ repr_str += "{"
+ for j, (_, seqlen) in enumerate(self.sets[i].items):
+ if j > 0:
+ repr_str += ","
+ repr_str += str(seqlen)
+ repr_str += "}"
+ repr_str += "]"
+ return repr_str
+
+ sorted_seqlen_list = sorted([(seqlen, i) for i, seqlen in enumerate(seqlen_list)])
+ states_pq = []
+ if equal_size:
+ assert len(seqlen_list) % k_partitions == 0, f"{len(seqlen_list)} % {k_partitions} != 0"
+ for offset in range(0, len(sorted_seqlen_list), k_partitions):
+ items = []
+ for i in range(k_partitions):
+ seqlen, idx = sorted_seqlen_list[offset + i]
+ items.append((idx, seqlen))
+ heapq.heappush(states_pq, State(items=items, k=k_partitions))
+ else:
+ for seqlen, idx in sorted_seqlen_list:
+ heapq.heappush(states_pq, State(items=[(idx, seqlen)], k=k_partitions))
+
+ while len(states_pq) > 1:
+ state0 = heapq.heappop(states_pq)
+ state1 = heapq.heappop(states_pq)
+ # merge states
+ state0.merge(state1)
+ heapq.heappush(states_pq, state0)
+
+ final_state = states_pq[0]
+ partitions = final_state.get_partitions()
+ if equal_size:
+ for i, partition in enumerate(partitions):
+ assert len(partition) * \
+ k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}"
+ return partitions
+
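+# Example (illustrative): splitting seqlens [1, 9, 5, 3, 7, 7] (sum 32) into 2
+# partitions without the equal-size constraint balances both sums to 16.
+#
+#     >>> parts = karmarkar_karp([1, 9, 5, 3, 7, 7], k_partitions=2, equal_size=False)
+#     >>> sorted(sum([1, 9, 5, 3, 7, 7][i] for i in p) for p in parts)
+#     [16, 16]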
+
+def greedy_partition(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ bias = sum(seqlen_list) + 1 if equal_size else 0
+ # items are visited in their input order (no sorting); when equal_size is set,
+ # the bias dominates every raw seqlen so partitions also get equal item counts
+ biased_seqlen = [(seqlen + bias, i) for i, seqlen in enumerate(seqlen_list)]
+ partitions = [[] for _ in range(k_partitions)]
+ partition_sums = [0 for _ in range(k_partitions)]
+ for seqlen, i in biased_seqlen:
+ min_idx = None
+ for j in range(k_partitions):
+ if min_idx is None or partition_sums[j] < partition_sums[min_idx]:
+ min_idx = j
+ partitions[min_idx].append(i)
+ partition_sums[min_idx] += seqlen
+ if equal_size:
+ for i, partition in enumerate(partitions):
+ assert len(partition) * \
+ k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}"
+ return partitions
+
+
+def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ """ get order of seq lengths to make partitions balanced, this is
+ used in balacing sum of seqlength across dp ranks and microbatches
+ Parameters:
+ seqlen_list (List[int]):
+ seq lengths of each items
+ k_partitions (int):
+ resulting number of partitions
+ equal_size (bool):
+ if True, number of items in each partitions must be equal.
+ if False, only consider balancing the sum, each partition can have
+ variable number of items
+ Returns:
+ partitions (List[List[int]]):
+ return k_partitions list containing the index of items.
+ """
+ assert len(seqlen_list) >= k_partitions, f"number of items:[{len(seqlen_list)}] < k_partitions:[{k_partitions}]"
+
+ def _check_and_sort_partitions(partitions):
+ assert len(partitions) == k_partitions, f"{len(partitions)} != {k_partitions}"
+ seen_idx = set()
+ sorted_partitions = [None] * k_partitions
+ for i, partition in enumerate(partitions):
+ assert len(partition) > 0, f"the {i}-th partition is empty"
+ for idx in partition:
+ seen_idx.add(idx)
+ sorted_partitions[i] = sorted(partition)
+ assert seen_idx == set(range(len(seqlen_list)))
+ return sorted_partitions
+
+ partitions = karmarkar_karp(seqlen_list=seqlen_list, k_partitions=k_partitions, equal_size=equal_size)
+ return _check_and_sort_partitions(partitions)
+
+
+def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], prefix):
+ # add some metrics of seqlen sum on dp ranks
+ k_partition = len(partitions)
+ # assert len(seqlen_list) % k_partition == 0
+ batch_size = len(seqlen_list) // k_partition
+ min_sum_seqlen = None
+ max_sum_seqlen = None
+ total_sum_seqlen = 0
+ for offset in range(0, len(seqlen_list), batch_size):
+ cur_sum_seqlen = sum(seqlen_list[offset:offset + batch_size])
+ if min_sum_seqlen is None or cur_sum_seqlen < min_sum_seqlen:
+ min_sum_seqlen = cur_sum_seqlen
+ if max_sum_seqlen is None or cur_sum_seqlen > max_sum_seqlen:
+ max_sum_seqlen = cur_sum_seqlen
+ total_sum_seqlen += cur_sum_seqlen
+
+ balanced_sum_seqlen_list = []
+ for partition in partitions:
+ cur_sum_seqlen_balanced = sum([seqlen_list[i] for i in partition])
+ balanced_sum_seqlen_list.append(cur_sum_seqlen_balanced)
+ # print("balanced_sum_seqlen_list: ", balanced_sum_seqlen_list)
+ min_sum_seqlen_balanced = min(balanced_sum_seqlen_list)
+ max_sum_seqlen_balanced = max(balanced_sum_seqlen_list)
+
+ return {
+ f'{prefix}/min': min_sum_seqlen,
+ f'{prefix}/max': max_sum_seqlen,
+ f'{prefix}/minmax_diff': max_sum_seqlen - min_sum_seqlen,
+ f'{prefix}/balanced_min': min_sum_seqlen_balanced,
+ f'{prefix}/balanced_max': max_sum_seqlen_balanced,
+ f'{prefix}/mean': total_sum_seqlen / len(partitions)
+ }
+
+
+def ceildiv(a, b):
+ return -(a // -b)
+
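+# Example (illustrative): -(a // -b) is ceiling division, exploiting floor
+# division's rounding toward negative infinity.
+#
+#     >>> ceildiv(7, 3), ceildiv(6, 3)
+#     (3, 2)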
+
+def rearrange_micro_batches(batch: TensorDict, max_token_len, dp_group=None):
+ """Split the batch into a list of micro_batches, where the max_token_len is smaller than max_token_len
+ and the number of valid tokens in each micro batch is well balanced.
+ """
+ # this is per local micro_bsz
+ max_seq_len = batch['attention_mask'].shape[-1]
+ assert max_token_len >= max_seq_len, \
+ f'max_token_len must be at least the max sequence length. Got {max_token_len=} and {max_seq_len=}'
+
+ seq_len_effective: torch.Tensor = batch['attention_mask'].sum(dim=1)
+ total_seqlen = seq_len_effective.sum().item()
+ num_micro_batches = ceildiv(total_seqlen, max_token_len)
+ if dist.is_initialized():
+ num_micro_batches = torch.tensor([num_micro_batches], device='cuda')
+ dist.all_reduce(num_micro_batches, op=dist.ReduceOp.MAX, group=dp_group)
+ num_micro_batches = num_micro_batches.cpu().item()
+
+ seq_len_effective = seq_len_effective.tolist()
+ assert num_micro_batches <= len(seq_len_effective)
+
+ micro_bsz_idx = get_seqlen_balanced_partitions(seq_len_effective, num_micro_batches, equal_size=False)
+
+ micro_batches = []
+
+ for partition in micro_bsz_idx:
+ curr_micro_batch = []
+ for idx in partition:
+ curr_micro_batch.append(batch[idx:idx + 1])
+ curr_micro_batch = torch.cat(curr_micro_batch)
+
+ micro_batches.append(curr_micro_batch)
+
+ return micro_batches, micro_bsz_idx
+
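+# Illustrative sketch (not in the original file): a batch whose attention_mask
+# rows contain 5, 3, 2 and 6 valid tokens (16 total) with max_token_len=8 is
+# split into ceildiv(16, 8) = 2 micro batches of 8 valid tokens each
+# (one balanced grouping: the 6- and 2-token rows vs. the 5- and 3-token rows).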
+
+def get_reverse_idx(idx_map):
+ reverse_idx_map = copy.deepcopy(idx_map)
+
+ for i, idx in enumerate(idx_map):
+ reverse_idx_map[idx] = i
+
+ return reverse_idx_map
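+
+
+# Example (illustrative): the reverse map sends each destination index back to
+# its source position.
+#
+#     >>> get_reverse_idx([2, 0, 1])
+#     [1, 2, 0]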
diff --git a/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b64b6623ac62b6b3f4288dccf8f5307fc87439c7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py
@@ -0,0 +1,58 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utils for tokenization."""
+import warnings
+
+__all__ = ['hf_tokenizer']
+
+
+def set_pad_token_id(tokenizer):
+ """Set pad_token_id to eos_token_id if it is None.
+
+ Args:
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set.
+
+ """
+ if tokenizer.pad_token_id is None:
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+ warnings.warn(f'tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}')
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ warnings.warn(f'tokenizer.pad_token is None. Now set to {tokenizer.eos_token}')
+
+
+def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs):
+ """Create a huggingface pretrained tokenizer.
+
+ Args:
+ name_or_path (str): The name or path of the tokenizer.
+ correct_pad_token (bool): Whether to correct the pad token id.
+ correct_gemma2 (bool): Whether to correct the gemma2 tokenizer.
+ **kwargs: The keyword arguments for the tokenizer.
+
+ Returns:
+ transformers.PreTrainedTokenizer: The pretrained tokenizer.
+
+ """
+ from transformers import AutoTokenizer
+ if correct_gemma2 and isinstance(name_or_path, str) and 'gemma-2-2b-it' in name_or_path:
+ # the EOS token in gemma2 is ambiguous, which may worsen RL performance.
+ # https://huggingface.co/google/gemma-2-2b-it/commit/17a01657f5c87135bcdd0ec7abb4b2dece04408a
+ warnings.warn('Found gemma-2-2b-it tokenizer. Set eos_token and eos_token_id to <end_of_turn> and 107.')
+ kwargs['eos_token'] = '<end_of_turn>'
+ kwargs['eos_token_id'] = 107
+ tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
+ if correct_pad_token:
+ set_pad_token_id(tokenizer)
+ return tokenizer
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb63df13b9c26802dff23c92ae8e36f5c23ae4fd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py
@@ -0,0 +1,82 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Adapted from Cruise.
+"""
+
+import torch
+
+from typing import Union
+
+HALF_LIST = [16, "16", "fp16", "float16"]
+FLOAT_LIST = [32, "32", "fp32", "float32"]
+BFLOAT_LIST = ["bf16", "bfloat16"]
+
+
+class PrecisionType(object):
+ """Type of precision used.
+
+ >>> PrecisionType.HALF == 16
+ True
+ >>> PrecisionType.HALF in (16, "16")
+ True
+ """
+
+ HALF = "16"
+ FLOAT = "32"
+ FULL = "64"
+ BFLOAT = "bf16"
+ MIXED = "mixed"
+
+ @staticmethod
+ def supported_type(precision: Union[str, int]) -> bool:
+ return any(x == precision for x in PrecisionType.supported_types())
+
+ @staticmethod
+ def supported_types() -> list[str]:
+ # PrecisionType is a plain class rather than an Enum, so list its values explicitly
+ return [PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.FULL, PrecisionType.BFLOAT, PrecisionType.MIXED]
+
+ @staticmethod
+ def is_fp16(precision):
+ return precision in HALF_LIST
+
+ @staticmethod
+ def is_fp32(precision):
+ return precision in FLOAT_LIST
+
+ @staticmethod
+ def is_bf16(precision):
+ return precision in BFLOAT_LIST
+
+ @staticmethod
+ def to_dtype(precision):
+ if precision in HALF_LIST:
+ return torch.float16
+ elif precision in FLOAT_LIST:
+ return torch.float32
+ elif precision in BFLOAT_LIST:
+ return torch.bfloat16
+ else:
+ raise RuntimeError(f"unexpected precision: {precision}")
+
+ @staticmethod
+ def to_str(precision):
+ if precision == torch.float16:
+ return 'fp16'
+ elif precision == torch.float32:
+ return 'fp32'
+ elif precision == torch.bfloat16:
+ return 'bf16'
+ else:
+ raise RuntimeError(f"unexpected precision: {precision}")
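+
+
+# Example (illustrative): round-tripping between config strings and torch dtypes.
+#
+#     >>> PrecisionType.to_dtype('bf16')
+#     torch.bfloat16
+#     >>> PrecisionType.to_str(torch.float16)
+#     'fp16'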
diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d53ca7a4e40efc715ceba1f3a8c725c2fe256a0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py
@@ -0,0 +1,492 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contain small torch utilities
+"""
+
+from typing import Dict, Union, List, Optional
+
+import os
+import torch
+import torch.distributed
+import torch.nn.functional as F
+from tensordict import TensorDict
+from torch import nn
+
+try:
+ from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
+ FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = True
+except ImportError:
+ FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = False
+
+
+def gather_from_labels(data, label):
+ """Gather the label from data. The value in label should be [0, vocab_size)
+
+ Args:
+ data: (..., vocab_size)
+ label (torch.IntTensor) : (...,)
+
+ Returns:
+
+ """
+
+ output = torch.gather(data, -1, label.unsqueeze(-1)).squeeze(-1)
+ return output
+
+
+def logprobs_from_logits(logits, labels):
+ """
+ See: https://github.com/pytorch/pytorch/issues/563#issuecomment-330103591
+ """
+ if FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE:
+ batch_dim = logits.shape[:-1]
+ last_dim = logits.shape[-1]
+ logits = logits.reshape(-1, last_dim)
+ labels = labels.reshape(-1)
+ output = logprobs_from_logits_flash_attn(logits, labels)
+ output = output.view(*batch_dim)
+ else:
+ output = logprobs_from_logits_naive(logits, labels)
+ return output
+
+
+def logprobs_from_logits_flash_attn(logits, labels):
+ output = -cross_entropy_loss(logits, labels)[0]
+ return output
+
+
+def logprobs_from_logits_naive(logits, labels):
+ logp = F.log_softmax(logits, dim=-1)
+ logpy = gather_from_labels(logp, labels)
+ return logpy
+
+
+def logprobs_of_labels_v2(logits: torch.FloatTensor, labels):
+ """
+ A memory efficient implementation of logprobs_from_logits
+ """
+ assert logits.dtype == torch.float32, 'Using bf16 logits with logprobs_of_labels_v2 may lead to divergence'
+ logprobs_labels = torch.gather(logits, dim=-1, index=labels.unsqueeze(-1))
+ logprobs_labels = logprobs_labels - torch.logsumexp(logits, dim=-1, keepdim=True)
+ return logprobs_labels.squeeze(-1)
+
+
+def clip_by_value(x, tensor_min, tensor_max):
+ """
+ Tensor extension to torch.clamp
+ https://github.com/pytorch/pytorch/issues/2793#issuecomment-428784713
+ """
+ clipped = torch.max(torch.min(x, tensor_max), tensor_min)
+ return clipped
+
+
+def entropy_from_logits(logits: torch.Tensor):
+ """Calculate entropy from logits."""
+ pd = torch.nn.functional.softmax(logits, dim=-1)
+ entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1)
+ return entropy
+
+
+def masked_sum(values, mask, axis=None):
+ """Compute mean of tensor with a masked values."""
+ return (values * mask).sum(axis=axis)
+
+
+def masked_mean(values, mask, axis=None):
+ """Compute mean of tensor with a masked values."""
+ return (values * mask).sum(axis=axis) / mask.sum(axis=axis)
+
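+# Example (illustrative): only unmasked entries contribute to the mean.
+#
+#     >>> masked_mean(torch.tensor([1., 2., 3., 4.]), torch.tensor([1., 1., 0., 0.]))
+#     tensor(1.5000)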
+
+def masked_var(values, mask, unbiased=True):
+ """Compute variance of tensor with masked values."""
+ mean = masked_mean(values, mask)
+ centered_values = values - mean
+ variance = masked_mean(centered_values**2, mask)
+ if unbiased:
+ mask_sum = mask.sum()
+ if mask_sum == 0:
+ raise ValueError("At least one element in the mask has to be 1.")
+ # note that if mask_sum == 1, then there is a division by zero issue
+ # to avoid it you just need to use a larger minibatch_size
+ if mask_sum == 1:
+ raise ValueError("The sum of the mask is one, which can cause a division by zero.")
+ bessel_correction = mask_sum / (mask_sum - 1)
+ variance = variance * bessel_correction
+ return variance
+
+
+def masked_whiten(values, mask, shift_mean=True):
+ """Whiten values with masked values."""
+ mean, var = masked_mean(values, mask), masked_var(values, mask)
+ whitened = (values - mean) * torch.rsqrt(var + 1e-8)
+ if not shift_mean:
+ whitened += mean
+ return whitened
+
+
+def get_eos_mask(response_id: torch.Tensor, eos_token: int = 2, dtype=torch.int64):
+ '''Mask out everything after the first EOS token.
+
+ e.g. with eos_token=1:
+ response_id: [0, 0, 2, 42, 3, 5, 1, 0, 0]
+ eos_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0]
+ '''
+ eos_mask = response_id.eq(eos_token).long()
+ eos_mask = (torch.cumsum(eos_mask, dim=1) - eos_mask).bool()
+ eos_mask = torch.logical_not(eos_mask).to(dtype)
+ return eos_mask
+
+
+def compute_grad_norm(model: nn.Module):
+ # note: despite its name, this returns the *sum of squared* gradient entries,
+ # not the 2-norm; take a sqrt if the actual norm is needed
+ total_grad_square = 0
+ for param in model.parameters():
+ if param.grad is not None:
+ total_grad_square += torch.sum(torch.square(param.grad.detach())).item()
+ return total_grad_square
+
+
+def broadcast_dict_tensor(tensors: Union[Dict[str, torch.Tensor], TensorDict], src, group):
+ """
+ TODO: optimize this. Technically, we only need one broadcast
+ """
+
+ for key in tensors.sorted_keys:
+ torch.distributed.broadcast(tensors[key], src=src, group=group, async_op=False)
+
+
+def allgather_dict_tensors(tensors: Union[Dict[str, torch.Tensor], TensorDict], size, group, dim=0):
+ """
+ TODO: optimize this.
+ - We can use async ops
+ - We can use only one allgather
+ Args:
+ tensors:
+ size:
+ group:
+
+ Returns:
+
+ """
+ if isinstance(tensors, TensorDict):
+ is_tensor_dict = True
+ tensors_as_dict = tensors.to_dict()
+ else:
+ tensors_as_dict = tensors
+ is_tensor_dict = False
+
+ output = {}
+ sorted_keys = sorted(tensors_as_dict.keys())
+ for key in sorted_keys:
+ val = tensors_as_dict[key]
+ output[key] = [torch.empty_like(val) for _ in range(size)]
+ torch.distributed.all_gather(output[key], val, group=group, async_op=False)
+ output[key] = torch.cat(output[key], dim=dim)
+
+ if is_tensor_dict:
+ output = TensorDict(source=output, batch_size=tensors.batch_size[0] * size)
+
+ return output
+
+
+def split_dict_tensor_into_batches(tensors: TensorDict, batch_size) -> List[TensorDict]:
+ assert tensors.batch_size[0] % batch_size == 0, \
+ f'input data batch size: {tensors.batch_size[0]}, split batch size: {batch_size}'
+ return tensors.split(batch_size)
+
+
+def pad_sequence_to_length(tensors, max_seq_len, pad_token_id, left_pad=False):
+ """
+ pad a 2D tensors (e.g. responses, logprobs) in the last dim to max_seq_length.
+ input shape: [bs, seq_length]
+ output shape: [bs, max_seq_length]
+ (0, max_seq_len - tensors.shape[-1]) means right pad to max_seq_length and no left pad
+ """
+ if tensors.shape[-1] >= max_seq_len:
+ return tensors
+ pad_tuple = (max_seq_len - tensors.shape[-1], 0) if left_pad else (0, max_seq_len - tensors.shape[-1])
+ return F.pad(tensors, pad_tuple, 'constant', pad_token_id)
+
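+# Illustrative usage of pad_sequence_to_length (made-up token ids):
+#
+#   x = torch.tensor([[1, 2, 3]])
+#   pad_sequence_to_length(x, max_seq_len=5, pad_token_id=0)                 # [[1, 2, 3, 0, 0]]
+#   pad_sequence_to_length(x, max_seq_len=5, pad_token_id=0, left_pad=True)  # [[0, 0, 1, 2, 3]]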
+
+from transformers import PreTrainedTokenizer
+
+
+def tokenize_and_postprocess_data(prompt: str,
+ tokenizer: PreTrainedTokenizer,
+ max_length: int,
+ pad_token_id: int,
+ left_pad=True,
+ truncation='error'):
+ """
+ Tokenize `prompt`, then pad (or truncate, according to `truncation`) to `max_length`.
+ Returns the resulting input_ids and attention_mask.
+ """
+ assert truncation in ['left', 'right', 'error']
+
+ input_data = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
+
+ input_ids = input_data['input_ids']
+ attention_mask = input_data['attention_mask']
+
+ assert input_ids.ndim == 2
+
+ sequence_length = input_ids.shape[-1]
+ if sequence_length < max_length:
+ input_ids = pad_sequence_to_length(input_ids,
+ max_seq_len=max_length,
+ pad_token_id=pad_token_id,
+ left_pad=left_pad)
+ attention_mask = pad_sequence_to_length(attention_mask,
+ max_seq_len=max_length,
+ pad_token_id=0,
+ left_pad=left_pad)
+ elif sequence_length > max_length:
+ if truncation == 'left':
+ # actually, left truncation may not be reasonable
+ input_ids = input_ids[:, -max_length:]
+ attention_mask = attention_mask[:, -max_length:]
+ elif truncation == 'right':
+ input_ids = input_ids[:, :max_length]
+ attention_mask = attention_mask[:, :max_length]
+ elif truncation == 'error':
+ raise NotImplementedError(f'{sequence_length=} is larger than {max_length=}')
+ else:
+ raise NotImplementedError(f'Unknown truncation method {truncation}')
+
+ return input_ids, attention_mask
+
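+# Hedged usage sketch for tokenize_and_postprocess_data (the tokenizer and the
+# prompt below are assumptions for illustration):
+#
+#   input_ids, attention_mask = tokenize_and_postprocess_data(
+#       prompt='hello world',
+#       tokenizer=tokenizer,                 # any HF PreTrainedTokenizer
+#       max_length=8,
+#       pad_token_id=tokenizer.pad_token_id,
+#       left_pad=True,                       # prompts are conventionally left-padded
+#       truncation='error')                  # raise if the prompt exceeds max_length
+#   # both outputs then have shape [1, 8]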
+
+def remove_pad_token(input_ids: torch.Tensor, attention_mask: torch.Tensor):
+ """ Remove the pad token.
+
+ Args:
+ input_ids shape: [bs, seq_length]
+ attention_mask shape: [bs, seq_length]
+ Returns:
+ no_padding_batch(List[List[int]]): contains the rmpad token ids per query.
+ """
+ no_padding_batch = []
+ for ids, mask in zip(input_ids, attention_mask):
+ no_padding_batch.append((ids[len(ids) - mask.sum():]).cpu().numpy().tolist())
+ return no_padding_batch
+
+
+def log_probs_from_logits_response(input_ids, logits, response_length):
+ """Compute the response log_probs from full logits. Note that logits = model(input_ids)
+
+ Args:
+ input_ids: [batch_size, seqlen]
+ logits: [batch_size, seqlen, vocab_size]
+
+ Returns:
+ response_log_prob: [batch_size, response_length]
+ """
+ response_logits = logits[:, -response_length - 1:-1]
+ response = input_ids[:, -response_length:]
+ response_log_prob = logprobs_from_logits(logits=response_logits, labels=response)
+ return response_log_prob
+
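+# The off-by-one slice above can be read with a small example (illustrative):
+# with seqlen=5 and response_length=2, response_logits = logits[:, 2:4] and
+# response = input_ids[:, 3:5], i.e. the logits at position t score the token
+# observed at position t + 1.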
+
+def log_probs_from_logits_response_rmpad(input_ids, attention_mask, logits_rmpad, response_length):
+ """Compute the log_probs from logits with rmpad logits and pad input. Note that
+ logits_rmpad = model(input_ids_rmpad). For each sentences, there is a shift between
+ logits and input_ids.
+ The reason for this function to is to compute logprobs_from_logits in rmpad mode because it is memory-intensive
+ for large vocab_size
+
+ Args:
+ input_ids: [batch_size, seqlen]
+ attention_mask: [batch_size, seqlen]
+ logits_rmpad: [total_nnz, vocab_size]
+ response_length: int
+ """
+ from flash_attn.bert_padding import pad_input, unpad_input
+
+ batch_size, seqlen = input_ids.shape
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask=attention_mask)
+ input_ids_rmpad = input_ids_rmpad.squeeze(-1)
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0)
+ full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,)
+ full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length]
+ return output
+
+
+def log_probs_from_logits_all_rmpad(input_ids_rmpad, logits_rmpad, indices, batch_size, seqlen, response_length):
+ """Compute the log_probs from logits with rmpad input_ids and logits. Note that
+ logits_rmpad = model(input_ids_rmpad). For each sentences, there is a shift between
+ logits and input_ids.
+ The reason for this function to is to compute logprobs_from_logits in rmpad mode because it is memory-intensive
+ for large vocab_size
+
+ Args:
+ input_ids_rmpad: [1, total_nnz]
+ logits_rmpad: [total_nnz, vocab_size]
+ indices: [total_nnz]
+ batch_size: int
+ seqlen: int
+ response_length: int
+ """
+ from flash_attn.bert_padding import pad_input
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # transpose back to [total_nnz, 1]
+ input_ids_rmpad = input_ids_rmpad.squeeze(-1)
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0)
+ full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,)
+ full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length]
+ return output
+
+
+from transformers.generation.logits_process import (TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper)
+
+
+def post_process_logits(input_ids, logits, temperature, top_k, top_p):
+ if temperature != 1.:
+ logits = logits.div_(temperature) # inplace operation to avoid OOM
+ # TODO: add them back
+ # if top_k is not None and top_k > 0:
+ # logits = TopKLogitsWarper(top_k=top_k)(input_ids, logits)
+ # if top_p is not None and top_p < 1.0 and top_p > 0.0:
+ # logits = TopPLogitsWarper(top_p=top_p)(input_ids, logits)
+ return logits
+
+
+"""
+Optimizer related
+"""
+
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+import math
+
+
+def get_cosine_schedule_with_warmup(
+ optimizer: Optimizer,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ min_lr_ratio: float = 0.0,
+ num_cycles: float = 0.5,
+ last_epoch: int = -1,
+):
+ """
+ Create a schedule with a learning rate that decreases following the values of the cosine function from the
+ initial lr set in the optimizer down to min_lr_ratio times that lr, after a warmup period during which it
+ increases linearly between 0 and the initial lr set in the optimizer.
+ Args:
+ optimizer (:class:`~torch.optim.Optimizer`):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (:obj:`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (:obj:`int`):
+ The total number of training steps.
+ min_lr_ratio (:obj:`float`, `optional`, defaults to 0.0):
+ The minimum lr ratio w.r.t the maximum.
+ num_cycles (:obj:`float`, `optional`, defaults to 0.5):
+ The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+ following a half-cosine).
+ last_epoch (:obj:`int`, `optional`, defaults to -1):
+ The index of the last epoch when resuming training.
+ Return:
+ :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+ assert min_lr_ratio >= 0 and min_lr_ratio <= 1.
+ coef = (1 - min_lr_ratio) * 0.5
+ intercept = (1 + min_lr_ratio) * 0.5
+
+ def lr_lambda(current_step):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+ x = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
+ return max(0.0, x * coef + intercept)
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
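+# Worked values for the schedule above (illustrative numbers): with
+# num_warmup_steps=10, num_training_steps=100 and min_lr_ratio=0.1 we get
+# coef=0.45 and intercept=0.55, so
+#   lr_lambda(0)   = 0.0   # start of warmup
+#   lr_lambda(10)  = 1.0   # cos(0) * 0.45 + 0.55, warmup finished
+#   lr_lambda(100) = 0.1   # cos(pi) * 0.45 + 0.55 = min_lr_ratio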
+
+def get_constant_schedule_with_warmup(
+ optimizer: Optimizer,
+ num_warmup_steps: int,
+ last_epoch: int = -1,
+):
+ """Linear warmup from 0 to the optimizer's lr, then hold it constant."""
+
+ def lr_lambda(current_step):
+ return min(1, float(current_step) / float(max(1, num_warmup_steps)))
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
+ tgt_len=input_shape[-1]).to(inputs_embeds.device)
+ combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask +
+ combined_attention_mask)
+
+ return combined_attention_mask
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
+ """
+ Make the causal mask used for uni-directional (causal) self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+def get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
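+
+
+# Illustrative behaviour of get_unpad_data (a sketch, not part of the original
+# API): for attention_mask = [[1, 1, 0], [1, 1, 1]] it returns
+#   indices             -> [0, 1, 3, 4, 5]   (positions of real tokens)
+#   cu_seqlens          -> [0, 2, 5]         (cumulative sequence lengths)
+#   max_seqlen_in_batch -> 3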
diff --git a/code/RL_model/verl/Search-R1/verl/utils/tracking.py b/code/RL_model/verl/Search-R1/verl/utils/tracking.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1fbd6f330451b89286644e226fb743237bc436c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/tracking.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A unified tracking interface that supports logging data to different backends
+"""
+import dataclasses
+from enum import Enum
+from functools import partial
+from pathlib import Path
+from typing import List, Union, Dict, Any
+
+
+class Tracking(object):
+ supported_backend = ['wandb', 'mlflow', 'console']
+
+ def __init__(self, project_name, experiment_name, default_backend: Union[str, List[str]] = 'console', config=None):
+ if isinstance(default_backend, str):
+ default_backend = [default_backend]
+ for backend in default_backend:
+ if backend == 'tracking':
+ import warnings
+ warnings.warn("The `tracking` logger is deprecated. Use `wandb` instead.", DeprecationWarning)
+ else:
+ assert backend in self.supported_backend, f'{backend} is not supported'
+
+ self.logger = {}
+
+ if 'tracking' in default_backend or 'wandb' in default_backend:
+ import wandb
+ import os
+ WANDB_API_KEY = os.environ.get("WANDB_API_KEY", None)
+ if WANDB_API_KEY:
+ wandb.login(key=WANDB_API_KEY)
+ wandb.init(project=project_name, name=experiment_name, config=config)
+ self.logger['wandb'] = wandb
+
+ if 'mlflow' in default_backend:
+ import mlflow
+ mlflow.start_run(run_name=experiment_name)
+ mlflow.log_params(_compute_mlflow_params_from_objects(config))
+ self.logger['mlflow'] = _MlflowLoggingAdapter()
+
+ if 'console' in default_backend:
+ from verl.utils.logger.aggregate_logger import LocalLogger
+ self.console_logger = LocalLogger(print_to_console=True)
+ self.logger['console'] = self.console_logger
+
+ def log(self, data, step, backend=None):
+ for default_backend, logger_instance in self.logger.items():
+ if backend is None or default_backend in backend:
+ logger_instance.log(data=data, step=step)
+
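+# Hedged usage sketch for Tracking (the names below are illustrative):
+#
+#   tracker = Tracking(project_name='ppo', experiment_name='run-0',
+#                      default_backend=['console'])
+#   tracker.log(data={'actor/lr': 1e-6}, step=0)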
+
+class _MlflowLoggingAdapter:
+
+ def log(self, data, step):
+ import mlflow
+ mlflow.log_metrics(metrics=data, step=step)
+
+
+def _compute_mlflow_params_from_objects(params) -> Dict[str, Any]:
+ if params is None:
+ return {}
+
+ return _flatten_dict(_transform_params_to_json_serializable(params, convert_list_to_dict=True), sep='/')
+
+
+def _transform_params_to_json_serializable(x, convert_list_to_dict: bool):
+ _transform = partial(_transform_params_to_json_serializable, convert_list_to_dict=convert_list_to_dict)
+
+ if dataclasses.is_dataclass(x):
+ return _transform(dataclasses.asdict(x))
+ if isinstance(x, dict):
+ return {k: _transform(v) for k, v in x.items()}
+ if isinstance(x, list):
+ if convert_list_to_dict:
+ return {'list_len': len(x)} | {f'{i}': _transform(v) for i, v in enumerate(x)}
+ else:
+ return [_transform(v) for v in x]
+ if isinstance(x, Path):
+ return str(x)
+ if isinstance(x, Enum):
+ return x.value
+
+ return x
+
+
+def _flatten_dict(raw: Dict[str, Any], *, sep: str) -> Dict[str, Any]:
+ import pandas as pd
+ ans = pd.json_normalize(raw, sep=sep).to_dict(orient='records')[0]
+ assert isinstance(ans, dict)
+ return ans
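+
+
+# Illustrative behaviour of the helpers above (values are assumptions):
+#
+#   _compute_mlflow_params_from_objects({'optim': {'lr': 1e-6}, 'tags': ['a', 'b']})
+#   # -> {'optim/lr': 1e-06, 'tags/list_len': 2, 'tags/0': 'a', 'tags/1': 'b'}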
diff --git a/code/RL_model/verl/Search-R1/verl/utils/ulysses.py b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py
new file mode 100644
index 0000000000000000000000000000000000000000..c085becc591d29a9517966cdee601843bdf24371
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py
@@ -0,0 +1,288 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for DeepSpeed Ulysses Sequence Parallelism.
+DeepSpeed Ulysses Paper: https://arxiv.org/abs/2309.14509
+Inspired from: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py
+"""
+from typing import Any, Optional, List, Tuple
+
+import torch
+from torch import Tensor
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+_ULYSSES_SEQUENCE_PARALLEL_GROUP = None
+
+
+def set_ulysses_sequence_parallel_group(group: dist.ProcessGroup):
+ """
+ Set ulysses sequence parallel process group.
+ """
+ global _ULYSSES_SEQUENCE_PARALLEL_GROUP
+ _ULYSSES_SEQUENCE_PARALLEL_GROUP = group
+
+
+def get_ulysses_sequence_parallel_group() -> Optional[dist.ProcessGroup]:
+ """
+ Get ulysses sequence parallel process group.
+ """
+ global _ULYSSES_SEQUENCE_PARALLEL_GROUP
+ return _ULYSSES_SEQUENCE_PARALLEL_GROUP
+
+
+def get_ulysses_sequence_parallel_world_size(group: ProcessGroup = None) -> int:
+ """
+ Get ulysses sequence parallel world size.
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ return dist.get_world_size(group) if group else 1
+
+
+def get_ulysses_sequence_parallel_rank(group: ProcessGroup = None) -> int:
+ """
+ Get ulysses sequence parallel rank.
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ return dist.get_rank(group) if group else 0
+
+
+def gather_seq_scatter_heads(
+ x: Tensor,
+ seq_dim: int,
+ head_dim: int,
+ unpadded_dim_size: int = 0,
+ group: ProcessGroup = None,
+) -> Tensor:
+ """
+ All-to-all used to sync the embedding input in sequence parallelism:
+ gather the sequence dimension and scatter the head dim,
+ e.g. seq_dim: 1, head_dim: 2
+ [bsz, seq/n, h, ...] -> [bsz, seq, h/n, ...]
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ if not group:
+ return x
+ sp_world = get_ulysses_sequence_parallel_world_size(group)
+ x = SeqAllToAll.apply(group, x, head_dim, seq_dim)
+ if unpadded_dim_size and unpadded_dim_size % sp_world != 0:
+ padding_size = x.size(seq_dim) - unpadded_dim_size
+ x = _unpad_tensor(x, seq_dim, padding_size)
+ return x
+
+
+def gather_heads_scatter_seq(x: Tensor, head_dim: int, seq_dim: int, group: ProcessGroup = None) -> Tensor:
+ """
+ All-to-all used to sync the attention result in sequence parallelism:
+ gather the head dimension and scatter the seq dim,
+ e.g. seq_dim: 1, head_dim: 2
+ [bsz, seq, h/n, ...] -> [bsz, seq/n, h, ...]
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ if not group:
+ return x
+ dim_size = x.size(seq_dim)
+ sp_world = get_ulysses_sequence_parallel_world_size(group)
+ if dim_size % sp_world != 0:
+ padding_size = sp_world - (dim_size % sp_world)
+ x = _pad_tensor(x, seq_dim, padding_size)
+ return SeqAllToAll.apply(group, x, seq_dim, head_dim, False)
+
+
+def _pad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor:
+ shape = list(x.shape)
+ shape[dim] = padding_size
+ pad = torch.zeros(shape, dtype=x.dtype, device=x.device)
+ return torch.cat([x, pad], dim=dim)
+
+
+def _unpad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor:
+ slc = [slice(None)] * len(x.shape)
+ slc[dim] = slice(0, -padding_size)
+ return x[slc]
+
+
+def slice_input_tensor(x: Tensor, dim: int, padding: bool = True, group: ProcessGroup = None) -> Tensor:
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ sp_world_size = dist.get_world_size(group)
+ sp_rank = get_ulysses_sequence_parallel_rank()
+ dim_size = x.size(dim)
+ # pad before slice
+ if padding and dim_size % sp_world_size:
+ padding_size = sp_world_size - (dim_size % sp_world_size)
+ x = _pad_tensor(x, dim, padding_size)
+ # slice the input tensor
+ parts = x.size(dim) // sp_world_size
+ slc = [slice(None)] * len(x.shape)
+ slc[dim] = slice(sp_rank * parts, (sp_rank + 1) * parts)
+ return x[slc].contiguous()
+
+
+def all_to_all_tensor(
+ local_input: Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ group: Optional[dist.ProcessGroup] = None,
+ async_op: bool = False,
+):
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ seq_world_size = dist.get_world_size(group)
+ input_list = [t.contiguous() for t in torch.tensor_split(local_input, seq_world_size, scatter_dim)]
+ output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)]
+ comm = dist.all_to_all(output_list, input_list, group=group, async_op=async_op)
+ if async_op:
+
+ def wait():
+ comm.wait()
+ return torch.cat(output_list, dim=gather_dim).contiguous()
+
+ return wait
+ return torch.cat(output_list, dim=gather_dim).contiguous()
+
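+# Shape sketch for all_to_all_tensor (illustrative): with seq_world_size=2 and
+# local_input of shape [bsz, seq, h, d], scatter_dim=2 and gather_dim=1 maps
+# [bsz, seq, h, d] -> [bsz, 2*seq, h/2, d]: each rank keeps half the heads but
+# sees the full, gathered sequence.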
+
+def all_gather_tensor(local_tensor: Tensor, group: Optional[dist.ProcessGroup] = None, async_op: bool = False):
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ sp_world_size = dist.get_world_size(group=group)
+ output_shape = list(local_tensor.shape)
+ output_shape[0] = output_shape[0] * sp_world_size
+ output = torch.empty(output_shape, dtype=local_tensor.dtype, device=local_tensor.device)
+ dist.all_gather_into_tensor(output, local_tensor, group=group, async_op=async_op)
+ return output
+
+
+class SeqAllToAll(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ group: dist.ProcessGroup,
+ local_input: Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ async_op: bool = False,
+ ) -> Tensor:
+ ctx.group = group
+ ctx.scatter_dim = scatter_dim
+ ctx.gather_dim = gather_dim
+ ctx.async_op = async_op
+ return all_to_all_tensor(local_input, scatter_dim, gather_dim, group, async_op)
+
+ @staticmethod
+ def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None, None, None]:
+ if ctx.async_op:
+ input_t = torch.cat(grad_output[1:], dim=ctx.gather_dim).contiguous()
+ else:
+ input_t = grad_output[0]
+ return (
+ None,
+ all_to_all_tensor(input_t, ctx.gather_dim, ctx.scatter_dim, ctx.group, False),
+ None,
+ None,
+ None,
+ None,
+ )
+
+
+class Gather(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any,
+ group: dist.ProcessGroup,
+ local_tensor: Tensor,
+ gather_dim: int,
+ grad_scaler: bool = True,
+ async_op=False) -> Tensor:
+ ctx.group = group
+ ctx.gather_dim = gather_dim
+ ctx.grad_scaler = grad_scaler
+ ctx.async_op = async_op
+
+ sp_world_size = dist.get_world_size(group=group)
+ ctx.sp_world_size = sp_world_size
+
+ sp_rank = dist.get_rank(group=group)
+ ctx.sp_rank = sp_rank
+
+ local_shape = list(local_tensor.size())
+ split_size = local_shape[0]
+ part_size = local_shape[gather_dim] # store original size
+ ctx.part_size = part_size
+
+ output = all_gather_tensor(local_tensor, group, async_op)
+ return torch.cat(output.split(split_size, dim=0), dim=gather_dim)
+
+ @staticmethod
+ def backward(ctx: Any, grad_output: Tensor) -> Any:
+ if ctx.grad_scaler:
+ grad_output = grad_output * ctx.sp_world_size
+ return (None, grad_output.split(ctx.part_size,
+ dim=ctx.gather_dim)[ctx.sp_rank].contiguous(), None, None, None, None)
+
+
+def gather_outpus_and_unpad(x: Tensor,
+ gather_dim: int,
+ unpad_dim: int = None,
+ padding_size: int = 0,
+ grad_scaler: bool = True,
+ group: Optional[dist.ProcessGroup] = None):
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ if group is None:
+ return x
+ x = Gather.apply(group, x, gather_dim, grad_scaler)
+ if unpad_dim is not None:
+ assert isinstance(padding_size, int), 'padding size is not given or is not an integer'
+ if padding_size == 0:
+ return x
+ x = _unpad_tensor(x, unpad_dim, padding_size)
+ return x
+
+
+def ulysses_pad_and_slice_inputs(input_ids_rmpad: torch.Tensor,
+ position_ids_rmpad: Optional[torch.Tensor] = None,
+ sp_size: int = 1):
+ """
+ Pad and slice input_ids to be divisible by sp_size
+ Pad position_ids to be divisible by sp_size.
+
+ Note both input_ids_rmpad and position_ids_rmpad will be padded,
+ but only input_ids will be sliced.
+
+ This is the pre-forward utility for ulysses sequence parallelism.
+
+ Args:
+ input_ids_rmpad: shape of [bsz, seqlen]
+ position_ids_rmpad: shape of [bsz, seqlen], where bsz must be 1
+ sp_size (int): ulysses sequence parallelism size
+
+ Returns:
+ torch.Tensor: padded and sliced input_ids
+ torch.Tensor: padded and sliced position_ids
+ int: pad size
+ """
+ if position_ids_rmpad is not None:
+ assert position_ids_rmpad.size(0) == 1
+ assert input_ids_rmpad.size(1) == position_ids_rmpad.size(1)
+ if sp_size <= 1:
+ return input_ids_rmpad, position_ids_rmpad, 0
+ _, total_seq_len = input_ids_rmpad.shape
+ pad_size = (sp_size - total_seq_len % sp_size) % sp_size
+ if pad_size > 0:
+ input_ids_rmpad = torch.nn.functional.pad(input_ids_rmpad, (0, pad_size), value=0)
+ if position_ids_rmpad is not None:
+ pad_pos_ids = torch.arange(pad_size, device=position_ids_rmpad.device).unsqueeze(0)
+ position_ids_rmpad = torch.cat((position_ids_rmpad, pad_pos_ids), dim=-1)
+ # we don't need to slice position ids
+ input_ids_rmpad = slice_input_tensor(input_ids_rmpad, dim=1, padding=False)
+ return input_ids_rmpad, position_ids_rmpad, pad_size
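+
+
+# Pad-size arithmetic sketch (illustrative numbers): with total_seq_len=10 and
+# sp_size=4, pad_size = (4 - 10 % 4) % 4 = 2, so input_ids is padded to length
+# 12 and each sp rank receives a slice of 12 / 4 = 3 tokens.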
diff --git a/code/RL_model/verl/Search-R1/verl/workers/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ba4ea39448b3b4af59f5340f75212761ca4e72
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py
@@ -0,0 +1,1054 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+FSDP workers (actor/rollout/ref, critic, reward model) used to run the PPO algorithm
+"""
+
+import logging
+import os
+import warnings
+
+import torch
+import torch.distributed
+import verl.utils.hdfs_io as hdfs_io
+import verl.utils.torch_functional as verl_F
+from omegaconf import DictConfig, open_dict
+from verl import DataProto
+from verl.single_controller.base import Worker
+from verl.single_controller.base.decorator import register, Dispatch
+from verl.utils import hf_tokenizer
+from verl.utils.debug import log_gpu_memory_usage
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.fsdp_utils import get_fsdp_wrap_policy, offload_fsdp_grad, init_fn, get_init_weight_context_manager
+from verl.utils.fsdp_utils import offload_fsdp_optimizer, offload_fsdp_param_and_grad, load_fsdp_optimizer, \
+ load_fsdp_param_and_grad
+from verl.utils.import_utils import import_external_libs
+from verl.utils.model import compute_position_id_with_mask
+from verl.utils.flops_counter import FlopsCounter
+from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager
+
+from codetiming import Timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
+
+
+class ActorRolloutRefWorker(Worker):
+ """
+ This worker can be instantiated as a standalone actor or a standalone rollout or a standalone reference policy
+ or a hybrid engine based on the config.rollout
+ """
+
+ def __init__(self, config: DictConfig, role: str):
+ super().__init__()
+ self.config = config
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+
+ # build device mesh for FSDP
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ # TODO(sgm): support FSDP hybrid shard for larger model
+ self.device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
+
+ # build device mesh for Ulysses Sequence Parallel
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.actor.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ self.role = role
+ assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref']
+
+ self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref']
+ self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref']
+ self._is_ref = self.role in ['ref', 'actor_rollout_ref']
+
+ self._is_offload_param = False
+ self._is_offload_grad = False
+ self._is_offload_optimizer = False
+ if self._is_actor:
+ self._is_offload_param = self.config.actor.fsdp_config.get('param_offload', False)
+ self._is_offload_grad = self.config.actor.fsdp_config.get('grad_offload', False)
+ self._is_offload_optimizer = self.config.actor.fsdp_config.get('optimizer_offload', False)
+ elif self._is_ref:
+ # TODO: it seems that manual offload is slower than FSDP offload
+ self._is_offload_param = self.config.ref.fsdp_config.get('param_offload', False)
+
+ # normalize config
+ if self._is_actor:
+ self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_mini_batch_size *= self.config.rollout.n
+ self.config.actor.ppo_micro_batch_size *= self.config.rollout.n
+ if self._is_rollout:
+ self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n
+ if self._is_ref:
+ self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n
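+ # A worked example of the normalization above (illustrative numbers, not
+ # this run's config): with world_size=8, ulysses_sequence_parallel_size=2
+ # and rollout.n=1, dp = 8 // 2 = 4, so a global ppo_mini_batch_size of 64
+ # becomes 64 // 4 * 1 = 16 per dp rank; the micro batch sizes shrink the
+ # same way.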
+
+ def _build_model_optimizer(self,
+ model_path,
+ fsdp_config,
+ optim_config,
+ override_model_config,
+ use_remove_padding=False,
+ enable_gradient_checkpointing=False,
+ trust_remote_code=False):
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.torch_dtypes import PrecisionType
+ from transformers import AutoModelForCausalLM, AutoConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision
+ from torch import optim
+
+ log_gpu_memory_usage('Before init from HF AutoModel', logger=logger)
+ local_path = copy_local_path_from_hdfs(model_path)
+
+ # note that we have to create the model in fp32. Otherwise, the optimizer states would be in bf16, which is incorrect
+ # TODO(zhangchi.usc1992): 1. support create from random initialized model. 2. Support init with FSDP directly
+ self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+
+ torch_dtype = fsdp_config.get('model_dtype', None)
+ if torch_dtype is None:
+ torch_dtype = torch.float32 if self._is_actor else torch.bfloat16
+ else:
+ torch_dtype = PrecisionType.to_dtype(torch_dtype)
+
+ # override model kwargs
+ actor_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(actor_model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(actor_model_config, verbose=True)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
+ if self.rank == 0:
+ print(f'Model config after override: {actor_model_config}')
+
+ # NOTE(fix me): tie_word_embedding causes meta_tensor init to hang
+ init_context = get_init_weight_context_manager(use_meta_tensor=not actor_model_config.tie_word_embeddings)
+
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ actor_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path,
+ torch_dtype=torch_dtype,
+ config=actor_model_config,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+ # some parameters may not be in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2
+ actor_module.to(torch_dtype)
+
+ if enable_gradient_checkpointing:
+ actor_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})
+ torch.distributed.barrier()
+
+ if self.rank == 0:
+ print_model_size(actor_module)
+
+ log_gpu_memory_usage('After init from HF AutoModel', logger=logger)
+
+ # We wrap FSDP for rollout as well
+ mixed_precision_config = fsdp_config.get('mixed_precision', None)
+ if mixed_precision_config is not None:
+ param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16'))
+ reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32'))
+ buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32'))
+ else:
+ param_dtype = torch.bfloat16
+ reduce_dtype = torch.float32
+ buffer_dtype = torch.float32
+
+ mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype)
+
+ if self._is_ref:
+ mixed_precision = None
+
+ auto_wrap_policy = get_fsdp_wrap_policy(module=actor_module, config=fsdp_config.get('wrap_policy', None))
+
+ if self._is_rollout and self.config.rollout.name == 'hf':
+ # TODO(zhangchi.usc1992, shengguangming) fix me. Currently, auto_wrap_policy causes HFRollout to hang in Gemma
+ auto_wrap_policy = None
+
+ print(f'wrap_policy: {auto_wrap_policy}')
+
+ # TODO(sgm): support hybrid
+ if auto_wrap_policy is None:
+ sharding_strategy = ShardingStrategy.SHARD_GRAD_OP
+ else:
+ sharding_strategy = ShardingStrategy.FULL_SHARD
+
+ # TODO: add transformer policy
+ actor_module_fsdp = FSDP(
+ actor_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=sharding_strategy, # zero3
+ mixed_precision=mixed_precision,
+ sync_module_states=True,
+ device_mesh=self.device_mesh,
+ forward_prefetch=False)
+
+ log_gpu_memory_usage('After Actor FSDP init', logger=logger)
+
+ # TODO: add more optimizer args into config
+ if self._is_actor:
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup
+ actor_optimizer = optim.AdamW(actor_module_fsdp.parameters(),
+ lr=optim_config.lr,
+ betas=optim_config.get('betas', (0.9, 0.999)),
+ weight_decay=optim_config.get('weight_decay', 1e-2))
+
+ total_steps = optim_config.get('total_training_steps', 0)
+ num_warmup_steps_ratio = optim_config.get('lr_warmup_steps_ratio', 0.)
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
+
+ print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')
+
+ actor_lr_scheduler = get_constant_schedule_with_warmup(optimizer=actor_optimizer,
+ num_warmup_steps=num_warmup_steps)
+ else:
+ actor_optimizer = None
+ actor_lr_scheduler = None
+
+ log_gpu_memory_usage('After actor optimizer init', logger=logger)
+
+ return actor_module_fsdp, actor_optimizer, actor_lr_scheduler, actor_model_config
+
+ def _build_rollout(self):
+ from torch.distributed.device_mesh import init_device_mesh
+ # TODO(sgm): support FSDP hybrid shard for larger model
+ infer_tp = self.config.rollout.tensor_model_parallel_size
+ dp = self.world_size // infer_tp
+ assert self.world_size % infer_tp == 0, f'rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}'
+ rollout_device_mesh = init_device_mesh('cuda', mesh_shape=(dp, infer_tp), mesh_dim_names=['dp', 'infer_tp'])
+
+ if self.config.rollout.name == 'hf':
+ from verl.workers.rollout import HFRollout
+ from verl.workers.sharding_manager import BaseShardingManager
+ rollout = HFRollout(module=self.actor_module_fsdp, config=self.config.rollout)
+ rollout_sharding_manager = BaseShardingManager()
+ # TODO: a sharding manager that does nothing?
+ elif self.config.rollout.name == 'vllm':
+ from verl.workers.rollout.vllm_rollout import vLLMRollout
+ from verl.workers.sharding_manager import FSDPVLLMShardingManager
+ log_gpu_memory_usage('Before building vllm rollout', logger=None)
+ rollout = vLLMRollout(actor_module=self.actor_module_fsdp,
+ config=self.config.rollout,
+ tokenizer=self.tokenizer,
+ model_hf_config=self.actor_model_config)
+ log_gpu_memory_usage('After building vllm rollout', logger=None)
+ if torch.distributed.get_world_size() == 1:
+ self.config.rollout.load_format = 'dummy_hf'
+ rollout_sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp,
+ inference_engine=rollout.inference_engine,
+ model_config=self.actor_model_config,
+ full_params='hf' in self.config.rollout.load_format,
+ device_mesh=rollout_device_mesh)
+ log_gpu_memory_usage('After building sharding manager', logger=None)
+ else:
+ raise NotImplementedError(f'Unknown rollout name: {self.config.rollout.name}')
+
+ return rollout, rollout_sharding_manager
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ from verl.workers.actor import DataParallelPPOActor
+ # This is used to import external_lib into the huggingface systems
+ import_external_libs(self.config.model.get('external_lib', None))
+
+ from omegaconf import OmegaConf
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+
+ use_remove_padding = self.config.model.get('use_remove_padding', False)
+
+ if self._is_actor or self._is_rollout:
+ # we need the model for actor and rollout
+ if self._is_actor:
+ optim_config = self.config.actor.optim
+ fsdp_config = self.config.actor.fsdp_config
+ else:
+ optim_config = None
+ fsdp_config = OmegaConf.create()
+ self.actor_module_fsdp, self.actor_optimizer, self.actor_lr_scheduler, self.actor_model_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ fsdp_config=fsdp_config,
+ optim_config=optim_config,
+ override_model_config=override_model_config,
+ use_remove_padding=use_remove_padding,
+ enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False),
+ trust_remote_code=self.config.model.get('trust_remote_code', False))
+
+ # get the original unwrapped module
+ self.actor_module = self.actor_module_fsdp._fsdp_wrapped_module
+
+ if self._is_offload_param:
+ # params are required during state_dict in the sharding manager
+ offload_fsdp_grad(module=self.actor_module_fsdp)
+ log_gpu_memory_usage('After offload actor grad during init', logger=logger)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.actor_optimizer)
+ log_gpu_memory_usage('After offload actor optimizer during init', logger=logger)
+ # load from checkpoint
+ if self._is_actor:
+ OmegaConf.set_struct(self.config.actor, True)
+ with open_dict(self.config.actor):
+ self.config.actor.use_remove_padding = use_remove_padding
+ self.actor = DataParallelPPOActor(config=self.config.actor,
+ actor_module=self.actor_module_fsdp,
+ actor_optimizer=self.actor_optimizer)
+
+ if self._is_rollout:
+ self.rollout, self.rollout_sharding_manager = self._build_rollout()
+
+ if self._is_ref:
+ self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path,
+ fsdp_config=self.config.ref.fsdp_config,
+ optim_config=None,
+ override_model_config=override_model_config,
+ use_remove_padding=use_remove_padding,
+ trust_remote_code=self.config.model.get(
+ 'trust_remote_code', False))[0]
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad)
+
+ OmegaConf.set_struct(self.config.ref, True)
+ with open_dict(self.config.ref):
+ self.config.ref.use_remove_padding = use_remove_padding
+ self.ref_policy = DataParallelPPOActor(config=self.config.ref, actor_module=self.ref_module_fsdp)
+
+ if self._is_actor:
+ self.flops_counter = FlopsCounter(self.actor_model_config)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def update_actor(self, data: DataProto):
+ data = data.to('cuda')
+
+ assert self._is_actor
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ load_fsdp_optimizer(optimizer=self.actor_optimizer, device_id=torch.cuda.current_device())
+
+ data.batch = data.batch.cuda()
+
+ log_gpu_memory_usage('Before update policy', logger=logger)
+
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+ # perform training
+ with Timer(name='update_policy', logger=None) as timer:
+ metrics = self.actor.update_policy(data=data)
+ delta_time = timer.last
+ global_num_tokens = data.meta_info['global_token_num']
+ estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)
+ metrics['mfu/actor'] = estimated_flops * self.config.actor.ppo_epochs / promised_flops / self.world_size
+
+ self.actor_lr_scheduler.step()
+ lr = self.actor_lr_scheduler.get_last_lr()[0]
+ metrics['actor/lr'] = lr
+
+ log_gpu_memory_usage('After update policy', logger=logger)
+
+ # TODO: here, we should return all metrics
+ output = DataProto(meta_info={'metrics': metrics})
+
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.actor_optimizer)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_log_prob(self, data: DataProto) -> DataProto:
+ """mostly copying from generate_sequences"""
+ data = data.to('cuda')
+
+ assert self._is_rollout
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ data.batch = data.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ data.meta_info.update(meta_info)
+
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data)
+ old_log_probs = self.actor.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'old_log_probs': old_log_probs})
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ # NOTE(sgm): the grad is already in CPU, only offload param here
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def generate_sequences(self, prompts: DataProto):
+ prompts = prompts.to('cuda')
+ # set to False if it is validation
+ recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True)
+
+ assert self._is_rollout
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ prompts.batch = prompts.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ prompts.meta_info.update(meta_info)
+ with self.rollout_sharding_manager:
+ log_gpu_memory_usage('After entering rollout sharding manager', logger=logger)
+
+ prompts = self.rollout_sharding_manager.preprocess_data(prompts)
+ output = self.rollout.generate_sequences(prompts=prompts)
+
+ log_gpu_memory_usage('After rollout generation', logger=logger)
+
+ output = self.rollout_sharding_manager.postprocess_data(output)
+
+ if self._is_actor and recompute_log_prob:
+ # we should always recompute old_log_probs when it is HybridEngine
+ output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
+ output.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu
+ output.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz
+ output.meta_info['temperature'] = self.config.rollout.temperature
+ # perform recompute log_prob
+ with self.ulysses_sharding_manager:
+ output = self.ulysses_sharding_manager.preprocess_data(output)
+ old_log_probs = self.actor.compute_log_prob(data=output)
+ output.batch['old_log_probs'] = old_log_probs
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ # NOTE(sgm): the grad is already in CPU, only offload param here
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_ref_log_prob(self, data: DataProto):
+ assert self._is_ref
+
+ data = data.to('cuda')
+
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.ref_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ micro_batch_size = self.config.ref.log_prob_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['temperature'] = self.config.rollout.temperature
+ data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu
+ data.meta_info['use_dynamic_bsz'] = self.config.ref.log_prob_use_dynamic_bsz
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data)
+ output = self.ref_policy.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'ref_log_prob': output})
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, local_path, hdfs_path=None):
+ assert self._is_actor
+ import torch
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ # TODO: support DCP and save sharded checkpoints
+ import torch.distributed
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig
+ cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(self.actor.actor_module, StateDictType.FULL_STATE_DICT, cfg):
+ state_dict = self.actor.actor_module.state_dict()
+ if self.rank == 0:
+ print(f'Saving actor checkpoint to {local_path}')
+ os.makedirs(local_path, exist_ok=True)
+ self.actor_module.save_pretrained(local_path, state_dict=state_dict)
+ self.tokenizer.save_pretrained(local_path)
+ if hdfs_path is not None:
+ print(f'Uploading actor checkpoint to {hdfs_path}')
+ hdfs_io.makedirs(hdfs_path, exist_ok=True)
+ hdfs_io.copy(src=local_path, dst=hdfs_path)
+
+ torch.distributed.barrier()
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+
+
+class CriticWorker(Worker):
+
+ def __init__(self, config):
+ super().__init__()
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+ self.config = config
+
+ # build device mesh for Ulysses Sequence Parallel
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ # set FSDP offload params
+ self._is_offload_param = self.config.model.fsdp_config.param_offload
+ self._is_offload_grad = self.config.model.fsdp_config.grad_offload
+ self._is_offload_optimizer = self.config.model.fsdp_config.optimizer_offload
+
+ # normalize config
+ self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
+ self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
+ self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() //
+ self.ulysses_sequence_parallel_size)
+
+ def _build_critic_model_optimizer(self, config):
+ # the following line is necessary
+ from verl.utils.model import LambdaLayer, print_model_size, squeeze
+ from verl.utils.torch_dtypes import PrecisionType
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision
+ from torch import optim
+
+ local_path = copy_local_path_from_hdfs(config.model.path)
+ # note that the tokenizer between actor and critic may be different, so we override the
+ # tokenizer info with the actor's. The critic may be randomly initialized from any
+ # architecture and need not match the actor's.
+
+ tokenizer_path = copy_local_path_from_hdfs(config.model.tokenizer_path)
+ self.tokenizer = hf_tokenizer(tokenizer_path, trust_remote_code=config.model.get('trust_remote_code', False))
+
+ from omegaconf import OmegaConf
+ override_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_config)
+ if self.rank == 0:
+ print(f'Critic overriding config {override_config_kwargs}')
+
+ torch_dtype = self.config.model.fsdp_config.get('model_dtype', 'fp32')
+ torch_dtype = PrecisionType.to_dtype(torch_dtype)
+
+ from transformers import AutoConfig, AutoModelForTokenClassification
+ from torch import nn
+
+ trust_remote_code = False
+ critic_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+ critic_model_config.num_labels = 1
+
+ use_remove_padding = config.model.get('use_remove_padding', False)
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(critic_model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(critic_model_config, verbose=True)
+
+ init_context = get_init_weight_context_manager()
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ setattr(critic_model_config, 'classifier_dropout', 0.)
+ setattr(critic_model_config, 'hidden_dropout', '0')
+ critic_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path,
+ torch_dtype=torch_dtype,
+ config=critic_model_config,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+
+ # some parameters may not be in torch_dtype
+ critic_module.to(torch_dtype)
+
+ if config.model.get('enable_gradient_checkpointing', False):
+ critic_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})
+ if self.rank == 0:
+ print_model_size(critic_module)
+
+ self.critic_model_config = critic_model_config
+
+ fsdp_config = self.config.model.fsdp_config
+ mixed_precision_config = fsdp_config.get('mixed_precision', None)
+ if mixed_precision_config is not None:
+ param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16'))
+ reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32'))
+ buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32'))
+ else:
+ param_dtype = torch.bfloat16
+ reduce_dtype = torch.float32
+ buffer_dtype = torch.float32
+
+ mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype)
+
+ auto_wrap_policy = get_fsdp_wrap_policy(module=critic_module, config=self.config.model.fsdp_config.wrap_policy)
+
+ log_gpu_memory_usage('Before critic FSDP', logger=None)
+
+ critic_module = FSDP(critic_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
+ mixed_precision=mixed_precision,
+ sync_module_states=True,
+ forward_prefetch=False)
+
+ log_gpu_memory_usage('After critic FSDP', logger=None)
+
+ critic_optimizer = optim.AdamW(critic_module.parameters(),
+ lr=config.optim.lr,
+ betas=config.optim.get('betas', (0.9, 0.999)),
+ weight_decay=config.optim.get('weight_decay', 1e-2))
+
+ total_steps = config.optim.get('total_training_steps', 0)
+ num_warmup_steps_ratio = config.optim.get('lr_warmup_steps_ratio', 0.)
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
+
+ print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')
+
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup
+ critic_lr_scheduler = get_constant_schedule_with_warmup(optimizer=critic_optimizer,
+ num_warmup_steps=num_warmup_steps)
+
+ return critic_module, critic_optimizer, critic_lr_scheduler
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # This is used to import external_lib into the huggingface systems
+ import_external_libs(self.config.model.get('external_lib', None))
+
+ from verl.workers.critic import DataParallelPPOCritic
+ self.critic_module, self.critic_optimizer, self.critic_lr_scheduler = self._build_critic_model_optimizer(
+ self.config)
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.critic_optimizer)
+
+ self.critic = DataParallelPPOCritic(config=self.config,
+ critic_module=self.critic_module,
+ critic_optimizer=self.critic_optimizer)
+
+ self.flops_counter = FlopsCounter(self.critic_model_config)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_values(self, data: DataProto):
+ data = data.to('cuda')
+
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ micro_batch_size = self.config.forward_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu
+ data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+ values = self.critic.compute_values(data=data)
+ output = DataProto.from_dict(tensors={'values': values})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ output = output.to('cpu')
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def update_critic(self, data: DataProto):
+ data = data.to('cuda')
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ load_fsdp_optimizer(optimizer=self.critic_optimizer, device_id=torch.cuda.current_device())
+
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+
+ with Timer(name='update_critic', logger=None) as timer:
+ metrics = self.critic.update_critic(data=data)
+ delta_time = timer.last
+
+ global_num_tokens = data.meta_info['global_token_num']
+ estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)
+ metrics['mfu/critic'] = estimated_flops * self.config.ppo_epochs / promised_flops / self.world_size
+
+ self.critic_lr_scheduler.step()
+ lr = self.critic_lr_scheduler.get_last_lr()[0]
+ metrics['critic/lr'] = lr
+
+ output = DataProto(batch=None, meta_info={'metrics': metrics})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.critic_optimizer)
+ torch.cuda.empty_cache()
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, local_path, hdfs_path=None):
+ import torch
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ # TODO: support DCP and save sharded checkpoints
+ import torch.distributed
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig
+ cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
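+ # offload_to_cpu=True gathers the full state dict onto CPU memory and
+ # rank0_only=True materializes it only on rank 0, keeping the gather from
+ # exhausting GPU memory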
+ with FSDP.state_dict_type(self.critic_module, StateDictType.FULL_STATE_DICT, cfg):
+ state_dict = self.critic_module.state_dict()
+ if self.rank == 0:
+ print(f'Saving critic checkpoint to {local_path}')
+ os.makedirs(local_path, exist_ok=True)
+ self.critic_module._fsdp_wrapped_module.save_pretrained(local_path, state_dict=state_dict)
+ self.tokenizer.save_pretrained(local_path)
+ if hdfs_path is not None:
+ print(f'Uploading critic checkpoint to {hdfs_path}')
+ hdfs_io.makedirs(hdfs_path, exist_ok=True)
+ hdfs_io.copy(src=local_path, dst=hdfs_path)
+
+ torch.distributed.barrier()
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+
+
+# TODO(sgm): we may need to extract it to dp_reward_model.py
+class RewardModelWorker(Worker):
+ """
+ Note that we only implement reward models that are subclasses of AutoModelForTokenClassification.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+ self.config = config
+
+ # build device mesh for Ulysses Sequence Parallel
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ self.use_remove_padding = self.config.model.get('use_remove_padding', False)
+ self.config.micro_batch_size //= torch.distributed.get_world_size()
+
+ def _build_model(self, config):
+ # the following imports are necessary
+ from transformers import AutoModelForTokenClassification, AutoConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, CPUOffload
+
+ # download the checkpoint from hdfs
+ local_path = copy_local_path_from_hdfs(config.model.path)
+
+ if self.config.model.input_tokenizer is None:
+ self._do_switch_chat_template = False
+ else:
+ self._do_switch_chat_template = True
+ input_tokenizer_local_path = copy_local_path_from_hdfs(config.model.input_tokenizer)
+ self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path,
+ trust_remote_code=config.model.get('trust_remote_code', False))
+ self.tokenizer = hf_tokenizer(local_path, trust_remote_code=config.model.get('trust_remote_code', False))
+
+ trust_remote_code = config.model.get('trust_remote_code', False)
+ model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+ model_config.num_labels = 1
+
+ use_remove_padding = config.model.get('use_remove_padding', False)
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(model_config, verbose=True)
+
+ # the reward model is inference-only (no optimizer), so it is safe to create it directly in bf16 below
+ init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings)
+
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ setattr(model_config, 'classifier_dropout', 0.)
+ reward_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path,
+ config=model_config,
+ torch_dtype=torch.bfloat16,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+ reward_module.to(torch.bfloat16)
+ auto_wrap_policy = get_fsdp_wrap_policy(module=reward_module, config=self.config.model.fsdp_config)
+
+ reward_module = FSDP(
+ reward_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=ShardingStrategy.FULL_SHARD, # zero3
+ sync_module_states=True,
+ cpu_offload=CPUOffload(offload_params=self.config.model.fsdp_config.param_offload),
+ forward_prefetch=False)
+
+ return reward_module
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # This is used to import external_lib into the HuggingFace system
+ import_external_libs(self.config.model.get('external_lib', None))
+ self.reward_module = self._build_model(config=self.config)
+ torch.cuda.empty_cache()
+
+ def _forward_micro_batch(self, micro_batch):
+ from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis, rearrange
+ from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
+
+ with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ input_ids = micro_batch['input_ids']
+ batch_size, seqlen = input_ids.shape
+ attention_mask = micro_batch['attention_mask']
+ position_ids = micro_batch['position_ids']
+
+ if self.use_remove_padding:
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
+ attention_mask) # input_ids_rmpad (total_nnz, ...)
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
+
+ # unpad the position_ids to keep rotary embeddings aligned
+ position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
+ indices).transpose(0, 1)
+
+ # pad and slice the inputs if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(input_ids_rmpad, \
+ position_ids_rmpad, \
+ sp_size=self.ulysses_sequence_parallel_size)
+
+ # only pass input_ids and position_ids to enable flash_attn_varlen
+ output = self.reward_module(input_ids=input_ids_rmpad,
+ attention_mask=None,
+ position_ids=position_ids_rmpad,
+ use_cache=False)  # prevent the model from behaving as if it were generating
+ reward_rmpad = output.logits
+ reward_rmpad = reward_rmpad.squeeze(0) # (total_nnz)
+
+ # gather output if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ reward_rmpad = gather_outpus_and_unpad(reward_rmpad,
+ gather_dim=0,
+ unpad_dim=0,
+ padding_size=pad_size)
+
+ # pad it back
+ rm_score = pad_input(reward_rmpad, indices=indices, batch=batch_size, seqlen=seqlen).squeeze(-1)
+ else:
+ output = self.reward_module(input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids)
+ rm_score = output.logits # (batch_size, seq_len, 1)
+ rm_score = rm_score.squeeze(-1)
+
+ # extract the result of the last valid token
+ eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,)
+ rm_score = rm_score[torch.arange(batch_size), eos_mask_idx]
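+ # e.g. for a right-padded row with position_ids [0, 1, 2, 0, 0] and
+ # attention_mask [1, 1, 1, 0, 0], the product is [0, 1, 2, 0, 0] and
+ # argmax returns 2, the index of the last valid token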
+ return rm_score
+
+ def _expand_to_token_level(self, data: DataProto, scores: torch.Tensor):
+ batch_size = data.batch.batch_size[0]
+ # expand as token_level_reward
+ attention_mask = data.batch['attention_mask']
+ position_ids = data.batch['position_ids']
+ response_length = data.batch['responses'].shape[-1]
+ eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,)
+ token_level_scores = torch.zeros_like(attention_mask, dtype=scores.dtype) # (bsz, seqlen)
+ token_level_scores[torch.arange(batch_size), eos_mask_idx] = scores
+
+ # select the response part
+ token_level_scores = token_level_scores[:, -response_length:]
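+ # e.g. with seqlen 10, response_length 4 and the last valid token at index 7,
+ # the scalar score lands at response step 7 - (10 - 4) = 1 after the slice,
+ # while every other step stays zero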
+
+ return token_level_scores
+
+ def _switch_chat_template(self, data: DataProto):
+ src_max_length = data.batch['attention_mask'].shape[-1]
+
+ src_tokenizer = self.input_tokenizer
+ target_tokenizer = self.tokenizer
+
+ rm_input_ids = []
+ rm_attention_mask = []
+
+ for i in range(data.batch.batch_size[0]):
+ # extract raw prompt
+ chat: list = data.non_tensor_batch['raw_prompt'][i].tolist()
+
+ # extract response
+ response_ids = data.batch['responses'][i]
+ response_length = response_ids.shape[-1]
+ valid_response_length = data.batch['attention_mask'][i][-response_length:].sum()
+ valid_response_ids = response_ids[:valid_response_length]
+
+ # decode
+ response = src_tokenizer.decode(valid_response_ids)
+ # remove bos and eos
+ response = response.replace(src_tokenizer.eos_token, '')
+
+ chat.append({'role': 'assistant', 'content': response})
+
+ prompt_with_chat_template = target_tokenizer.apply_chat_template(chat,
+ add_generation_prompt=False,
+ tokenize=False)
+ if self.rank == 0 and i == 0:
+ # for debugging purposes
+ print(f'Switch template. chat: {prompt_with_chat_template}')
+
+ # the maximum length is actually determined by the reward model itself
+ max_length = self.config.get('max_length', src_max_length)
+ if max_length is None:
+ max_length = src_max_length
+ input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(
+ prompt=prompt_with_chat_template,
+ tokenizer=target_tokenizer,
+ max_length=max_length,
+ pad_token_id=target_tokenizer.pad_token_id,
+ left_pad=False, # right padding
+ truncation=self.config.get('truncation', 'right')) # truncate from the right
+
+ rm_input_ids.append(input_ids)
+ rm_attention_mask.append(attention_mask)
+
+ rm_input_ids = torch.cat(rm_input_ids, dim=0)
+ rm_attention_mask = torch.cat(rm_attention_mask, dim=0)
+
+ rm_position_ids = compute_position_id_with_mask(rm_attention_mask)
+
+ rm_inputs = {'input_ids': rm_input_ids, 'attention_mask': rm_attention_mask, 'position_ids': rm_position_ids}
+
+ return DataProto.from_dict(rm_inputs)
+
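+ # In short: each rollout is decoded with the actor-side tokenizer, re-rendered
+ # through the reward model's chat template, and re-tokenized right-padded, so
+ # the RM scores the conversation in its own expected format.
+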
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_rm_score(self, data: DataProto):
+ import itertools
+ from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
+ data = data.to('cuda')
+ if self._do_switch_chat_template:
+ rm_data = self._switch_chat_template(data)
+
+ rm_data.batch = rm_data.batch.cuda()
+
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ rm_data = self.ulysses_sharding_manager.preprocess_data(data=rm_data)
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+
+ use_dynamic_bsz = self.config.use_dynamic_bsz
+ if use_dynamic_bsz:
+ max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
+ micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len)
+ else:
+ micro_batches = rm_data.batch.split(self.config.micro_batch_size)
+ output = []
+ for micro_batch in micro_batches:
+ rm_score = self._forward_micro_batch(micro_batch)
+ output.append(rm_score)
+ scores = torch.cat(output, dim=0) # (batch_size)
+
+ if use_dynamic_bsz:
+ indices = list(itertools.chain.from_iterable(indices))
+ assert len(indices) == scores.size(0), f"{len(indices)} vs. {scores.size()}"
+ revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
+ scores = scores[revert_indices]
+
+ token_level_scores = self._expand_to_token_level(data, scores)
+ # Note that these are only the raw scores; they may differ from the final rewards used for RL training
+ output = DataProto.from_dict(tensors={'rm_scores': token_level_scores})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ output = output.to('cpu')
+ torch.cuda.empty_cache()
+ return output
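+
+
+# NOTE: illustrative sketch (hypothetical helper, not called by the trainer) of
+# how compute_rm_score restores dynamic-bsz outputs to the original input order.
+def _demo_dynamic_bsz_reorder():
+    import itertools
+    import torch
+    # suppose rearrange_micro_batches grouped samples as [[2, 0], [1]] by length
+    indices = [[2, 0], [1]]
+    scores_rearranged = torch.tensor([20., 0., 10.])  # scores of samples 2, 0, 1
+    flat = list(itertools.chain.from_iterable(indices))  # [2, 0, 1]
+    # build the inverse permutation: revert[i] = position of sample i in flat
+    revert = torch.empty(len(flat), dtype=torch.long)
+    revert[torch.tensor(flat)] = torch.arange(len(flat))
+    return scores_rearranged[revert]  # original order: [0., 10., 20.]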
diff --git a/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1143b7baa9ed1f15a9660fe892e77a57155b399e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py
@@ -0,0 +1,735 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The main entry point to run the PPO algorithm
+"""
+
+import os
+import logging
+import ray
+import torch
+import torch.distributed
+import torch.nn as nn
+from omegaconf import DictConfig
+from verl.single_controller.base.megatron.worker import MegatronWorker
+from verl.workers.actor.megatron_actor import MegatronPPOActor
+from verl.workers.critic.megatron_critic import MegatronPPOCritic
+from verl.workers.sharding_manager import AllGatherPPModel
+from verl.workers.reward_model.megatron.reward_model import MegatronRewardModel
+
+from verl.single_controller.base.decorator import register, Dispatch
+from verl import DataProto
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.debug import log_gpu_memory_usage
+from verl.utils.model import load_megatron_model_weights
+from verl.utils.megatron_utils import init_model_parallel_config
+from verl.utils.megatron_utils import offload_megatron_param_and_grad, load_megatron_param_and_grad
+from verl.utils import hf_tokenizer
+
+from megatron.core import parallel_state as mpu
+from megatron.core import ModelParallelConfig
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
+
+
+def set_random_seed(seed):
+ import torch
+ import numpy as np
+ import random
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ random.seed(seed)
+ if torch.cuda.device_count() > 0:
+ from megatron.core import tensor_parallel
+ tensor_parallel.model_parallel_cuda_manual_seed(seed)
+ # FIXME: torch cumsum not support deterministic (used in vllm sampler),
+ # https://github.com/pytorch/pytorch/issues/89492
+ # torch.use_deterministic_algorithms(True, warn_only=True)
+ # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+
+
+class ActorRolloutRefWorker(MegatronWorker):
+ """
+ This worker can be instantiated as a standalone actor, a standalone rollout,
+ a standalone reference policy, or a hybrid engine, depending on config.rollout.
+ """
+
+ def __init__(self, config: DictConfig, role: str):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies in different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.actor.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.actor.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.actor.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
+
+ set_random_seed(seed=self.config.actor.megatron.seed)
+
+ self.role = role
+ assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref']
+
+ self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref']
+ self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref']
+ self._is_ref = self.role in ['ref', 'actor_rollout_ref']
+
+ # TODO(sgm): Currently, we only support reference model param offload
+ # we will support the other offload types later
+ self._is_offload_param = False
+ self._is_offload_grad = False
+ self._is_offload_optimizer = False
+
+ # normalize config
+ if self._is_actor and self._is_rollout:
+ self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
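+ # e.g. on 8 GPUs with tensor parallel 2 and pipeline parallel 2, the
+ # data-parallel world size is 8 / (2 * 2) = 2, so a global
+ # ppo_mini_batch_size of 64 becomes 32 per data-parallel rank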
+ self._is_offload_param = self.config.actor.get('param_offload', False)
+ self._is_offload_grad = self.config.actor.get('grad_offload', False)
+ self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False)
+ elif self._is_ref:
+ self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
+ self._is_offload_param = self.config.ref.get('param_offload', False)
+
+ def _build_model_optimizer(self,
+ model_path,
+ megatron_config: ModelParallelConfig,
+ optim_config,
+ override_model_config,
+ enable_gradient_checkpointing=False):
+ from verl.utils.megatron.optimizer import get_megatron_optimizer
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron_utils import get_model, init_megatron_optim_config
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the actor_model_config
+ actor_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {actor_model_config}')
+
+ def megatron_actor_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # vpp is not supported yet because it hangs for an unknown reason; needs debugging
+ vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=actor_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=False)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ if self._is_actor and self._is_rollout:
+ # Initialize the 3D HybridEngine
+ hybrid_engine = AllGatherPPModel(model_provider=megatron_actor_model_provider)
+ # Fetch the model at current rank
+ actor_module = hybrid_engine.this_rank_models
+ if isinstance(actor_module, nn.ModuleList):
+ actor_module = [actor_module[0]]
+ if self.config.actor.load_weight:
+ load_megatron_model_weights(self.config,
+ actor_model_config,
+ actor_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=False)
+
+ if self.rank == 0:
+ print_model_size(actor_module[0])
+ log_gpu_memory_usage('After AllGatherPPModel init', logger=logger)
+ elif self._is_ref:
+ print(f'self.config.ref.load_weight: {self.config.ref.load_weight}')
+ ref_module = get_model(model_provider_func=megatron_actor_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=False)
+ # ref_module = nn.ModuleList(ref_module)
+
+ if self.config.ref.load_weight:  # should align with the actor
+ assert self.config.actor.load_weight == self.config.ref.load_weight
+ print('load ref weight start')
+ load_megatron_model_weights(self.config,
+ actor_model_config,
+ ref_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=False)
+ log_gpu_memory_usage('After ref module init', logger=logger)
+ return ref_module, actor_model_config
+
+ # TODO: add more optimizer args into config
+ if self._is_actor:
+ optim_config = init_megatron_optim_config(optim_config)
+ actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config)
+ else:
+ optim_config = None
+ actor_optimizer = None
+
+ log_gpu_memory_usage('After actor optimizer init', logger=logger)
+
+ return actor_module, hybrid_engine, actor_optimizer, actor_model_config, optim_config
+
+ def _build_rollout(self):
+ if self.config.rollout.name == 'vllm':
+ from verl.workers.rollout.vllm_rollout import vLLMRollout
+ from verl.workers.sharding_manager import MegatronVLLMShardingManager
+ from verl.utils.model import normalize_pp_vpp_params
+
+ # NOTE(sgm): If the QKV and gate_up projection layers are concatenated in the actor,
+ # we will reorganize their weight format when resharding from actor to rollout.
+ layer_name_mapping = {
+ "qkv_layer_name":
+ self.config.rollout.layer_name_map.get("qkv_layer_name", "qkv"),
+ "gate_proj_layer_name":
+ self.config.rollout.layer_name_map.get("gate_proj_layer_name", "linear_fc1.weight"),
+ }
+
+ # reshard the weight partition from actor to rollout to initialize the rollout class
+ # create a new cuda space for parameters not in this pp rank
+ self.hybrid_engine.load_params_to_cuda()
+ # broadcast the parameters from pp rank to other ranks
+ self.hybrid_engine.allgather_params()
+ # obtain name to parameters in pp/vpp
+ params = self.hybrid_engine.get_all_params()
+ # update the param names for the pp/vpp-partitioned weights
+ params = normalize_pp_vpp_params(params=params,
+ num_hidden_layers=self.actor_model_config.num_hidden_layers,
+ layer_name='layers')
+ rollout = vLLMRollout(actor_module=params,
+ config=self.config.rollout,
+ tokenizer=self.tokenizer,
+ model_hf_config=self.actor_model_config,
+ train_tp=mpu.get_tensor_model_parallel_world_size())
+ log_gpu_memory_usage('After building vllm rollout', logger=logger)
+
+ # perform weight resharding between actor and rollout
+ sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine,
+ inference_engine=rollout.inference_engine,
+ model_config=self.actor_model_config,
+ layer_name_mapping=layer_name_mapping)
+ log_gpu_memory_usage('After building sharding manager', logger=logger)
+ else:
+ raise NotImplementedError('Only vLLMRollout is supported with Megatron for now')
+
+ return rollout, sharding_manager
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the HuggingFace system
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.actor.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ if self._is_actor or self._is_rollout:
+ # we need the model for actor and rollout
+ if self._is_actor:
+ optim_config = self.config.actor.optim
+ else:
+ optim_config = None
+ self.actor_module, self.hybrid_engine, self.actor_optimizer, \
+ self.actor_model_config, self.actor_optim_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=optim_config,
+ override_model_config=override_model_config,
+ )
+
+ if self._is_actor:
+ self.actor = MegatronPPOActor(config=self.config.actor,
+ model_config=self.actor_model_config,
+ megatron_config=megatron_config,
+ actor_module=self.actor_module,
+ actor_optimizer=self.actor_optimizer,
+ actor_optimizer_config=self.actor_optim_config)
+
+ if self._is_rollout:
+ self.rollout, self.sharding_manager = self._build_rollout()
+
+ if self._is_ref:
+ self.ref_module, self.ref_model_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=None,
+ override_model_config=override_model_config,
+ )
+ self.ref_policy = MegatronPPOActor(config=self.config.ref,
+ model_config=self.ref_model_config,
+ megatron_config=megatron_config,
+ actor_module=self.ref_module,
+ actor_optimizer=None,
+ actor_optimizer_config=None)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def update_actor(self, data: DataProto):
+ assert self._is_actor
+
+ data.batch = data.batch.cuda()
+
+ log_gpu_memory_usage('Before update policy', logger=logger)
+
+ dataloader = self.actor.make_minibatch_iterator(data=data)
+ metrics = self.actor.update_policy(dataloader=dataloader)
+
+ log_gpu_memory_usage('After update policy', logger=logger)
+
+ # TODO: here, we should return all metrics
+ output = DataProto(meta_info={'metrics': metrics})
+ output = output.to('cpu')
+ torch.cuda.empty_cache()
+ return output
+
+ # @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO)
+ # def compute_log_prob(self, data: DataProto) -> DataProto:
+ # assert self._is_rollout
+ # output = self.actor.compute_log_prob(data=data)
+ # output = DataProto.from_dict(tensors={'old_log_probs': output})
+ # torch.cuda.empty_cache()
+ # return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO)
+ def generate_sequences(self, prompts: DataProto):
+ assert self._is_rollout
+
+ prompts.batch = prompts.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ prompts.meta_info.update(meta_info)
+ with self.sharding_manager:
+ log_gpu_memory_usage('After entering sharding manager', logger=logger)
+
+ prompts = self.sharding_manager.preprocess_data(prompts)
+ output = self.rollout.generate_sequences(prompts=prompts)
+
+ log_gpu_memory_usage('After rollout generation', logger=logger)
+
+ output = self.sharding_manager.postprocess_data(output)
+
+ validate = prompts.meta_info.get('validate', False)
+ if self._is_actor and not validate:
+ # we should always recompute old_log_probs when it is HybridEngine
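+ # (the rollout engine may use different kernels/precision than the training
+ # engine, so recomputing log-probs with the actor keeps PPO ratios consistent)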
+ output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
+ output.meta_info['temperature'] = self.config.rollout.temperature
+ old_log_probs = self.actor.compute_log_prob(data=output)
+ output.batch['old_log_probs'] = old_log_probs
+
+ output = output.to('cpu')
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_ref_log_prob(self, data: DataProto):
+ data = data.to('cuda')
+
+ assert self._is_ref
+ if self._is_offload_param:
+ load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad)
+
+ micro_batch_size = self.config.rollout.log_prob_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['temperature'] = self.config.rollout.temperature
+ output = self.ref_policy.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'ref_log_prob': output})
+ output = output.to('cpu')
+ if self._is_offload_param:
+ offload_megatron_param_and_grad(self.ref_module, self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_checkpoint(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_pretrained_model(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, checkpoint_path):
+ assert self._is_actor
+ pass
+
+
+class CriticWorker(MegatronWorker):
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies in different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
+
+ set_random_seed(seed=self.config.megatron.seed)
+
+ # normalize config
+ self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
+
+ # TODO(sgm): support critic model offload
+
+ def _build_critic_model_optimizer(self,
+ model_path,
+ megatron_config: ModelParallelConfig,
+ optim_config,
+ override_model_config,
+ enable_gradient_checkpointing=False):
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron.optimizer import get_megatron_optimizer
+ from verl.utils.megatron_utils import get_model, init_megatron_optim_config, init_model_parallel_config
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the critic_model_config
+ critic_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(critic_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {critic_model_config}')
+
+ def megatron_critic_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # TODO: support vpp here
+ # vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=critic_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=True)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ critic_module = get_model(model_provider_func=megatron_critic_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=True)
+ # note that critic_module is a list, for compatibility with the construction of interleaved pp (vpp);
+ # vpp is not used here yet, so the list holds a single model
+ # critic_module = nn.ModuleList(critic_module)
+
+ if self.config.load_weight:
+ load_megatron_model_weights(self.config,
+ critic_model_config,
+ critic_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=True)
+ if self.rank == 0:
+ print_model_size(critic_module[0])
+
+ # TODO: add more optimizer args into config
+ optim_config = init_megatron_optim_config(optim_config)
+ critic_optimizer = get_megatron_optimizer(model=critic_module, config=optim_config)
+ torch.cuda.empty_cache()
+ return critic_module, critic_optimizer, critic_model_config, optim_config
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # create critic
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the HuggingFace system
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ critic_module, critic_optimizer, critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=self.config.optim,
+ override_model_config=override_model_config)
+ self.critic = MegatronPPOCritic(config=self.config,
+ model_config=critic_model_config,
+ megatron_config=megatron_config,
+ critic_module=critic_module,
+ critic_optimizer=critic_optimizer,
+ critic_optimizer_config=critic_optimizer_config)
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_values(self, data: DataProto):
+ data = data.to('cuda')
+ values = self.critic.compute_values(data=data)
+ output = DataProto.from_dict(tensors={'values': values})
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def update_critic(self, data: DataProto):
+ data = data.to('cuda')
+ dataloader = self.critic.make_minibatch_iterator(data)
+ metrics = self.critic.update_critic(dataloader=dataloader)
+ output = DataProto(batch=None, meta_info={'metrics': metrics})
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_checkpoint(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, checkpoint_path):
+ pass
+
+
+class RewardModelWorker(MegatronWorker):
+ """
+ Note that we only implement reward models that are subclasses of AutoModelForSequenceClassification.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies in different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
+
+ set_random_seed(seed=self.config.megatron.seed)
+
+ # normalize config
+ self.config.micro_batch_size //= mpu.get_data_parallel_world_size()
+
+ def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config):
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron_utils import get_model
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the rm_model_config
+ rm_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(rm_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {rm_model_config}')
+
+ def megatron_rm_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # vpp is not supported yet because it hangs for an unknown reason; needs debugging
+ vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=rm_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=True)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ reward_model = get_model(model_provider_func=megatron_rm_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=False)
+ # note that reward_model is a list, for compatibility with the construction of interleaved pp (vpp);
+ # vpp is not used here yet, so the list holds a single model
+ # reward_model = nn.ModuleList(reward_model)
+
+ if self.config.load_weight:
+ load_megatron_model_weights(self.config,
+ rm_model_config,
+ reward_model,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=True)
+
+ # TODO: add more optimizer args into config
+ torch.cuda.empty_cache()
+ return reward_model, rm_model_config
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # create the reward model
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+ from transformers import AutoTokenizer
+
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the HuggingFace system
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+
+ sft_tokenizer_local_path = copy_local_path_from_hdfs(self.config.model.input_tokenizer)
+ sft_tokenizer = hf_tokenizer(sft_tokenizer_local_path)
+ rm_tokenizer_path = self.config.model.get('rm_tokenizer', None)
+ rm_tokenizer = None
+ if rm_tokenizer_path is not None:
+ rm_tokenizer_local_path = copy_local_path_from_hdfs(rm_tokenizer_path)
+ rm_tokenizer = hf_tokenizer(rm_tokenizer_local_path)
+
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ reward_model_module, reward_model_config = self._build_rm_model(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ override_model_config=override_model_config,
+ )
+ # FIXME(sgm): reward model param offload is implemented in MegatronRewardModel
+ # should be implemented in workers
+ self.rm = MegatronRewardModel(config=self.config,
+ reward_model_module=reward_model_module,
+ model_config=reward_model_config,
+ megatron_config=megatron_config,
+ sft_tokenizer=sft_tokenizer,
+ rm_tokenizer=rm_tokenizer)
+
+ # TODO: the reward model should use its own tokenizer instead of the SFT tokenizer;
+ # the input_ids, responses, attention_mask and position_ids may differ!
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_rm_score(self, data: DataProto):
+ data.batch = data.batch.cuda()
+ output = self.rm.compute_reward(data)
+ output = output.to('cpu')
+ return output
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..af209dadde9cd85183855f02f579d64ec3a6e363
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log
@@ -0,0 +1,23 @@
+wandb: Detected [openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::WorkerDict.ref_init_model()[39m (pid=1552808, ip=172.16.34.29, actor_id=e65619ea51238e1c4c82195501000000, repr=)
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func
+ return getattr(self.worker_dict[key], name)(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 286, in init_model
+ from verl.workers.actor import DataParallelPPOActor
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/actor/__init__.py", line 16, in
+ from .dp_actor import DataParallelPPOActor
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/actor/dp_actor.py", line 34, in
+ from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn/__init__.py", line 3, in
+ from flash_attn.flash_attn_interface import (
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 15, in
+ import flash_attn_2_cuda as flash_attn_gpu
+ImportError: /home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn_2_cuda.cpython-312-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5300dcb7d76408546b372027fa1fbbd53b54e600
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt
@@ -0,0 +1,288 @@
+verl==0.1
+psutil==7.1.3
+colorama==0.4.6
+annotated-doc==0.0.4
+pyasn1==0.6.1
+virtualenv==20.35.4
+requests==2.32.5
+nvidia-cufft-cu12==11.0.2.54
+nvidia-cufile-cu12==1.13.1.3
+ml_dtypes==0.5.4
+opentelemetry-sdk==1.39.0
+sglang==0.5.2
+xformers==0.0.27.post2
+lm-format-enforcer==0.10.6
+typing_extensions==4.15.0
+nvidia-cusparselt-cu12==0.7.1
+openai-harmony==0.0.4
+transformers==4.56.1
+pytest==9.0.2
+cupy-cuda12x==13.6.0
+tqdm==4.67.1
+onnx==1.20.0
+pybind11==3.0.1
+partial-json-parser==0.2.1.1.post7
+nvidia-nccl-cu12==2.20.5
+aiohttp-cors==0.8.1
+sniffio==1.3.1
+tensordict==0.10.0
+smart_open==7.5.0
+cffi==2.0.0
+asttokens==3.0.1
+opencensus==0.11.4
+rpds-py==0.30.0
+py-spy==0.4.1
+nvidia-nvjitlink-cu12==12.8.93
+httpx==0.28.1
+cuda-python==13.1.1
+annotated-types==0.7.0
+idna==3.11
+fsspec==2025.10.0
+parso==0.8.5
+torchvision==0.19.0
+MarkupSafe==3.0.3
+opentelemetry-api==1.39.0
+pytz==2025.2
+dnspython==2.8.0
+zipp==3.23.0
+PyYAML==6.0.3
+onnx-ir==0.1.12
+torchdata==0.11.0
+Markdown==3.10
+urllib3==2.6.1
+cuda-pathfinder==1.3.3
+nvidia-cuda-cupti-cu12==12.1.105
+httptools==0.7.1
+pyarrow==22.0.0
+opentelemetry-proto==1.39.0
+certifi==2025.11.12
+typer==0.20.0
+python-json-logger==4.0.0
+pillow==12.0.0
+cuda-bindings==13.1.1
+Werkzeug==3.1.4
+mdurl==0.1.2
+vllm==0.6.3
+referencing==0.37.0
+xxhash==3.6.0
+interegular==0.3.3
+build==1.3.0
+fastapi-cli==0.0.16
+tensorboard==2.20.0
+sentencepiece==0.2.1
+yarl==1.22.0
+opencv-fixer==0.2.5
+python-dotenv==1.2.1
+timm==1.0.16
+aiohappyeyeballs==2.6.1
+decord==0.6.0
+nvidia-cusolver-cu12==11.4.5.107
+jiter==0.12.0
+airportsdata==20250909
+nvidia-nvtx-cu12==12.1.105
+markdown-it-py==4.0.0
+torch==2.4.0
+thefuzz==0.22.1
+opencv-python-headless==4.11.0.86
+pycryptodomex==3.23.0
+pexpect==4.9.0
+distro==1.9.0
+cloudpickle==3.1.2
+mpmath==1.3.0
+antlr4-python3-runtime==4.9.3
+peft==0.18.0
+tzdata==2025.2
+accelerate==1.12.0
+watchfiles==1.1.1
+omegaconf==2.3.0
+multiprocess==0.70.18
+frozendict==2.4.7
+sympy==1.14.0
+setproctitle==1.3.7
+setuptools==79.0.1
+py-cpuinfo==9.0.0
+ipython_pygments_lexers==1.1.1
+openai==1.99.1
+outlines_core==0.2.11
+google-api-core==2.28.1
+llvmlite==0.44.0
+attrs==25.4.0
+packaging==25.0
+fastrlock==0.8.3
+astor==0.8.1
+gguf==0.10.0
+opencv-python==4.12.0.88
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cudnn-frontend==1.16.0
+pluggy==1.6.0
+compressed-tensors==0.11.0
+importlib_metadata==8.7.0
+RapidFuzz==3.14.3
+networkx==3.6.1
+httpcore==1.0.9
+pre_commit==4.5.0
+python-multipart==0.0.20
+rich==14.2.0
+onnxscript==0.3.1
+cbor2==5.7.1
+smmap==5.0.2
+numpy==1.26.4
+opentelemetry-exporter-prometheus==0.60b0
+click==8.2.1
+traitlets==5.14.3
+nvidia-curand-cu12==10.3.2.106
+pyvers==0.1.0
+huggingface-hub==0.36.0
+cfgv==3.5.0
+optree==0.18.0
+anthropic==0.75.0
+email-validator==2.3.0
+tabulate==0.9.0
+msgpack==1.1.2
+depyf==0.19.0
+numba==0.61.2
+six==1.17.0
+einops==0.8.1
+aiosignal==1.4.0
+propcache==0.4.1
+torch_memory_saver==0.0.8
+h11==0.16.0
+frozenlist==1.8.0
+pycountry==24.6.1
+modelscope==1.33.0
+sentry-sdk==2.47.0
+av==16.0.1
+stack-data==0.6.3
+typing-inspection==0.4.2
+googleapis-common-protos==1.72.0
+blake3==1.0.8
+nvidia-cudnn-cu12==9.1.0.70
+liger_kernel==0.6.4
+wrapt==2.0.1
+prompt_toolkit==3.0.52
+torchaudio==2.8.0
+identify==2.6.15
+mistral_common==1.8.6
+codetiming==1.4.0
+nodeenv==1.9.1
+platformdirs==4.5.1
+jsonschema-specifications==2025.9.1
+protobuf==6.33.2
+hydra-core==1.3.2
+absl-py==2.3.1
+tensorboard-data-server==0.7.2
+jsonschema==4.25.1
+pyasn1_modules==0.4.2
+tiktoken==0.12.0
+starlette==0.50.0
+pyproject_hooks==1.2.0
+flash_attn==2.8.1
+fastapi==0.124.2
+rsa==4.9.1
+nest-asyncio==1.6.0
+lark==1.2.2
+fastar==0.8.0
+datasets==4.4.1
+prometheus-fastapi-instrumentator==7.1.0
+nvidia-cusparse-cu12==12.1.0.106
+ruff==0.14.8
+mathruler==0.1.0
+pydantic_core==2.41.5
+pyairports==0.0.1
+ipython==9.8.0
+pynvml==13.0.1
+nvidia-cuda-nvrtc-cu12==12.1.105
+filelock==3.20.0
+loguru==0.7.3
+pandas==2.3.3
+msgspec==0.20.0
+uvicorn==0.38.0
+blobfile==3.0.0
+gitdb==4.0.12
+cachetools==6.2.2
+uv==0.9.17
+llguidance==0.7.30
+hf_transfer==0.1.9
+wcwidth==0.2.14
+aiohttp==3.13.2
+qwen-vl-utils==0.0.14
+rich-toolkit==0.17.0
+ptyprocess==0.7.0
+ipdb==0.13.13
+opencensus-context==0.1.3
+jedi==0.19.2
+soxr==1.0.0
+ray==2.52.1
+sgl-kernel==0.3.9.post2
+colorful==0.5.8
+pycparser==2.23
+charset-normalizer==3.4.4
+hf-xet==1.2.0
+dill==0.4.0
+tokenizers==0.22.1
+prometheus_client==0.23.1
+google-auth==2.43.0
+pydantic==2.12.5
+nvidia-ml-py==13.590.44
+fastapi-cloud-cli==0.6.0
+flashinfer-python==0.3.1
+orjson==3.11.5
+python-dateutil==2.9.0.post0
+GitPython==3.1.45
+triton==3.0.0
+torchao==0.9.0
+soundfile==0.13.1
+diskcache==5.6.3
+docstring_parser==0.17.0
+anyio==4.12.0
+matplotlib-inline==0.2.1
+Pygments==2.19.2
+pure_eval==0.2.3
+ninja==1.13.0
+outlines==0.0.46
+wandb==0.23.1
+regex==2025.11.3
+pyzmq==27.1.0
+iniconfig==2.3.0
+Jinja2==3.1.6
+wheel==0.45.1
+megatron-core==0.13.1
+multidict==6.7.0
+uvloop==0.22.1
+proto-plus==1.26.1
+pylatexenc==2.10
+decorator==5.2.1
+websockets==15.0.1
+shellingham==1.5.4
+lxml==6.0.2
+safetensors==0.7.0
+scipy==1.16.3
+xgrammar==0.1.25
+pybase64==1.4.3
+opentelemetry-semantic-conventions==0.60b0
+pydantic-extra-types==2.10.6
+rignore==0.7.6
+nvidia-cuda-runtime-cu12==12.1.105
+distlib==0.4.0
+executing==2.2.1
+grpcio==1.76.0
+pip==25.3
+autocommand==2.2.2
+typeguard==4.3.0
+more-itertools==10.3.0
+importlib_metadata==8.0.0
+jaraco.functools==4.0.1
+typing_extensions==4.12.2
+jaraco.text==3.12.1
+platformdirs==4.2.2
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+inflect==7.3.1
+tomli==2.0.1
+zipp==3.19.2
+backports.tarfile==1.2.0
+packaging==24.2
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5a3ad86d925ba3615248e77d3094bf18265b6dc
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json
@@ -0,0 +1,93 @@
+{
+ "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35",
+ "python": "CPython 3.12.12",
+ "startedAt": "2026-02-02T01:21:50.845281Z",
+ "args": [
+ "--node-ip-address=172.16.34.29",
+ "--node-manager-port=44253",
+ "--object-store-name=/tmp/ray/session_2026-02-01_20-21-00_836157_1537223/sockets/plasma_store",
+ "--raylet-name=/tmp/ray/session_2026-02-01_20-21-00_836157_1537223/sockets/raylet",
+ "--redis-address=None",
+ "--metrics-agent-port=62109",
+ "--logging-rotate-bytes=536870912",
+ "--logging-rotate-backup-count=5",
+ "--runtime-env-agent-port=52775",
+ "--gcs-address=172.16.34.29:61367",
+ "--session-name=session_2026-02-01_20-21-00_836157_1537223",
+ "--temp-dir=/tmp/ray",
+ "--webui=127.0.0.1:8301",
+ "--cluster-id=2549a1b366f4964ef8f23a40ed67bd8fc06a14a97f8f53cf9a7706d5",
+ "--startup-token=128",
+ "--worker-launch-time-ms=1769995275494",
+ "--node-id=abe8c189092abd663e817fc41a28c5b237c1d0ba9d4dd79fd858d0fd",
+ "--runtime-env-hash=1830736042"
+ ],
+ "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py",
+ "git": {
+ "remote": "https://github.com/PeterGriffinJin/Search-R1",
+ "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3"
+ },
+ "email": "shahidulshakib034@gmail.com",
+ "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1",
+ "host": "gamma",
+ "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3",
+ "cpu_count": 64,
+ "cpu_count_logical": 128,
+ "gpu": "NVIDIA A100 80GB PCIe",
+ "gpu_count": 6,
+ "disk": {
+ "/": {
+ "total": "3766429188096",
+ "used": "184569806848"
+ }
+ },
+ "memory": {
+ "total": "1081814863872"
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece"
+ }
+ ],
+ "cudaVersion": "13.0",
+ "writerId": "5m4a15ougycb78o8erwj3yecrb7tbwn1"
+}
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..f49edc5ee7edb53bbb72a5b046b7880bac1fac64
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log
@@ -0,0 +1,7 @@
+{"time":"2026-02-01T20:21:51.061260937-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpai5lb1ap/port-1545881.txt","pid":1545881,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2026-02-01T20:21:51.062200689-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1545881}
+{"time":"2026-02-01T20:21:51.062207345-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1545881-1551236-3300304792/socket","Net":"unix"}}
+{"time":"2026-02-01T20:21:51.22693701-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2026-02-01T20:21:51.242673041-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"t68srmo7","id":"1(@)"}
+{"time":"2026-02-01T20:21:52.263251526-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"t68srmo7","id":"1(@)"}
+{"time":"2026-02-01T20:22:38.856926907-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..6eb28f83e680c11c42634386c2768b87bd56582b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log
@@ -0,0 +1,6 @@
+{"time":"2026-02-01T20:21:51.244466362-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+{"time":"2026-02-01T20:21:52.25966277-05:00","level":"INFO","msg":"stream: created new stream","id":"t68srmo7"}
+{"time":"2026-02-01T20:21:52.259825063-05:00","level":"INFO","msg":"handler: started","stream_id":"t68srmo7"}
+{"time":"2026-02-01T20:21:52.26322258-05:00","level":"INFO","msg":"stream: started","id":"t68srmo7"}
+{"time":"2026-02-01T20:21:52.263274117-05:00","level":"INFO","msg":"writer: started","stream_id":"t68srmo7"}
+{"time":"2026-02-01T20:21:52.263304682-05:00","level":"INFO","msg":"sender: started","stream_id":"t68srmo7"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..b14f79f089787cd6d77dc9ca89080e44131ed562
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log
@@ -0,0 +1,21 @@
+2026-02-01 20:21:50,865 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Configure stats pid to 1545881
+2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings
+2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings
+2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2026-02-01 20:21:50,867 INFO MainThread:1545881 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log
+2026-02-01 20:21:50,867 INFO MainThread:1545881 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log
+2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():841] calling init triggers
+2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': True}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}}
+2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():889] starting backend
+2026-02-01 20:21:51,227 INFO MainThread:1545881 [wandb_init.py:init():892] sending inform_init request
+2026-02-01 20:21:51,235 INFO MainThread:1545881 [wandb_init.py:init():900] backend started and connected
+2026-02-01 20:21:51,244 INFO MainThread:1545881 [wandb_init.py:init():970] updated telemetry
+2026-02-01 20:21:51,270 INFO MainThread:1545881 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+2026-02-01 20:21:52,622 INFO MainThread:1545881 [wandb_init.py:init():1041] starting run threads in backend
+2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_console_start():2521] atexit reg
+2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_redirect():2438] Wrapping output streams.
+2026-02-01 20:21:53,470 INFO MainThread:1545881 [wandb_run.py:_redirect():2461] Redirects installed.
+2026-02-01 20:21:53,481 INFO MainThread:1545881 [wandb_init.py:init():1081] run started, returning control to user process
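
Note on the config dumped above: it selects 'adv_estimator': 'grpo', i.e. group-relative advantages computed from each prompt's sampled responses instead of from a learned critic. A minimal sketch of that normalization as GRPO is usually defined follows; it mirrors the config key, not verl's exact code, and since the dump also sets rollout 'n': 1 (for which a per-group std degenerates), a group size G > 1 is assumed here.

import torch

def grpo_advantages(rewards: torch.Tensor, group_size: int,
                    eps: float = 1e-6) -> torch.Tensor:
    """Normalize each prompt's group of G response rewards by the
    group mean/std, yielding critic-free advantages."""
    grouped = rewards.view(-1, group_size)    # [num_prompts, G]
    mean = grouped.mean(dim=1, keepdim=True)
    std = grouped.std(dim=1, keepdim=True)
    return ((grouped - mean) / (std + eps)).view(-1)  # flat [N] again
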
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32
Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb differ
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..224e7b99f95aed5c56df55343d05d4b6dfe0021f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log
@@ -0,0 +1,20 @@
+wandb: Detected [openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.ref_init_model() (pid=1569209, ip=172.16.34.29, actor_id=30bbe065b8ff586d669f4f9101000000, repr=)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func
+ return getattr(self.worker_dict[key], name)(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 335, in init_model
+ self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path,
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 143, in _build_model_optimizer
+ check_model_support_rmpad(actor_model_config.model_type)
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/models/registry.py", line 30, in check_model_support_rmpad
+ raise ValueError(f"Model architecture {model_type} is not supported for now. "
+ValueError: Model architecture qwen3 is not supported for now. RMPad supported architectures: dict_keys(['llama', 'mistral', 'gemma', 'qwen2']).Please set `use_remove_padding=False` in the model config.
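
This run (lmw144t2) dies in verl's remove-padding (RMPad) registry: 'qwen3' is not among the model types with a remove-padding implementation, so the ref worker aborts before FSDP wrapping. A minimal sketch of the guard that raised above follows; the function name, file, and message come from the traceback, the set literal and the exact launch override are assumptions.

# Sketch of check_model_support_rmpad in verl/models/registry.py;
# the container of supported types is assumed to be a plain set.
_MODELS_SUPPORT_RMPAD = {'llama', 'mistral', 'gemma', 'qwen2'}

def check_model_support_rmpad(model_type: str) -> None:
    if model_type not in _MODELS_SUPPORT_RMPAD:
        raise ValueError(
            f"Model architecture {model_type} is not supported for now. "
            f"RMPad supported architectures: {_MODELS_SUPPORT_RMPAD}. "
            "Please set `use_remove_padding=False` in the model config."
        )

# The fix the message asks for, as a Hydra CLI override on the training
# entrypoint (exact launch command is an assumption):
#   actor_rollout_ref.model.use_remove_padding=false

The later run's config dump (lly0j9zs, below) indeed shows 'use_remove_padding': False, so the override was applied before the next failure surfaced.
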
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2dd689031142d7d6d129a52ef2a92f2070eb5c06
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt
@@ -0,0 +1,288 @@
+verl==0.1
+psutil==7.1.3
+colorama==0.4.6
+annotated-doc==0.0.4
+pyasn1==0.6.1
+virtualenv==20.35.4
+requests==2.32.5
+nvidia-cufft-cu12==11.0.2.54
+nvidia-cufile-cu12==1.13.1.3
+verl==0.1
+ml_dtypes==0.5.4
+opentelemetry-sdk==1.39.0
+sglang==0.5.2
+xformers==0.0.27.post2
+lm-format-enforcer==0.10.6
+typing_extensions==4.15.0
+nvidia-cusparselt-cu12==0.7.1
+openai-harmony==0.0.4
+transformers==4.56.1
+pytest==9.0.2
+psutil==7.1.3
+cupy-cuda12x==13.6.0
+tqdm==4.67.1
+onnx==1.20.0
+pybind11==3.0.1
+partial-json-parser==0.2.1.1.post7
+nvidia-nccl-cu12==2.20.5
+aiohttp-cors==0.8.1
+sniffio==1.3.1
+tensordict==0.10.0
+smart_open==7.5.0
+cffi==2.0.0
+asttokens==3.0.1
+opencensus==0.11.4
+rpds-py==0.30.0
+py-spy==0.4.1
+nvidia-nvjitlink-cu12==12.8.93
+httpx==0.28.1
+cuda-python==13.1.1
+annotated-types==0.7.0
+idna==3.11
+fsspec==2025.10.0
+parso==0.8.5
+torchvision==0.19.0
+MarkupSafe==3.0.3
+opentelemetry-api==1.39.0
+pytz==2025.2
+dnspython==2.8.0
+zipp==3.23.0
+PyYAML==6.0.3
+onnx-ir==0.1.12
+torchdata==0.11.0
+Markdown==3.10
+urllib3==2.6.1
+cuda-pathfinder==1.3.3
+nvidia-cuda-cupti-cu12==12.1.105
+httptools==0.7.1
+pyarrow==22.0.0
+opentelemetry-proto==1.39.0
+certifi==2025.11.12
+typer==0.20.0
+python-json-logger==4.0.0
+pillow==12.0.0
+cuda-bindings==13.1.1
+Werkzeug==3.1.4
+mdurl==0.1.2
+vllm==0.6.3
+referencing==0.37.0
+xxhash==3.6.0
+interegular==0.3.3
+build==1.3.0
+fastapi-cli==0.0.16
+tensorboard==2.20.0
+sentencepiece==0.2.1
+flash_attn==2.8.3
+yarl==1.22.0
+opencv-fixer==0.2.5
+python-dotenv==1.2.1
+timm==1.0.16
+aiohappyeyeballs==2.6.1
+decord==0.6.0
+nvidia-cusolver-cu12==11.4.5.107
+jiter==0.12.0
+airportsdata==20250909
+nvidia-nvtx-cu12==12.1.105
+markdown-it-py==4.0.0
+torch==2.4.0
+thefuzz==0.22.1
+opencv-python-headless==4.11.0.86
+pycryptodomex==3.23.0
+pexpect==4.9.0
+distro==1.9.0
+cloudpickle==3.1.2
+mpmath==1.3.0
+antlr4-python3-runtime==4.9.3
+peft==0.18.0
+tzdata==2025.2
+accelerate==1.12.0
+watchfiles==1.1.1
+omegaconf==2.3.0
+multiprocess==0.70.18
+frozendict==2.4.7
+sympy==1.14.0
+setproctitle==1.3.7
+setuptools==79.0.1
+py-cpuinfo==9.0.0
+ipython_pygments_lexers==1.1.1
+openai==1.99.1
+outlines_core==0.2.11
+google-api-core==2.28.1
+llvmlite==0.44.0
+attrs==25.4.0
+packaging==25.0
+fastrlock==0.8.3
+astor==0.8.1
+gguf==0.10.0
+opencv-python==4.12.0.88
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cudnn-frontend==1.16.0
+pluggy==1.6.0
+compressed-tensors==0.11.0
+importlib_metadata==8.7.0
+RapidFuzz==3.14.3
+networkx==3.6.1
+httpcore==1.0.9
+pre_commit==4.5.0
+python-multipart==0.0.20
+rich==14.2.0
+onnxscript==0.3.1
+cbor2==5.7.1
+smmap==5.0.2
+numpy==1.26.4
+opentelemetry-exporter-prometheus==0.60b0
+click==8.2.1
+traitlets==5.14.3
+nvidia-curand-cu12==10.3.2.106
+pyvers==0.1.0
+huggingface-hub==0.36.0
+cfgv==3.5.0
+optree==0.18.0
+anthropic==0.75.0
+email-validator==2.3.0
+tabulate==0.9.0
+msgpack==1.1.2
+depyf==0.19.0
+numba==0.61.2
+six==1.17.0
+einops==0.8.1
+aiosignal==1.4.0
+propcache==0.4.1
+torch_memory_saver==0.0.8
+h11==0.16.0
+frozenlist==1.8.0
+pycountry==24.6.1
+modelscope==1.33.0
+sentry-sdk==2.47.0
+av==16.0.1
+stack-data==0.6.3
+typing-inspection==0.4.2
+googleapis-common-protos==1.72.0
+blake3==1.0.8
+nvidia-cudnn-cu12==9.1.0.70
+liger_kernel==0.6.4
+wrapt==2.0.1
+prompt_toolkit==3.0.52
+torchaudio==2.8.0
+identify==2.6.15
+mistral_common==1.8.6
+codetiming==1.4.0
+nodeenv==1.9.1
+platformdirs==4.5.1
+jsonschema-specifications==2025.9.1
+protobuf==6.33.2
+hydra-core==1.3.2
+absl-py==2.3.1
+tensorboard-data-server==0.7.2
+jsonschema==4.25.1
+pyasn1_modules==0.4.2
+tiktoken==0.12.0
+starlette==0.50.0
+pyproject_hooks==1.2.0
+fastapi==0.124.2
+rsa==4.9.1
+nest-asyncio==1.6.0
+lark==1.2.2
+fastar==0.8.0
+datasets==4.4.1
+prometheus-fastapi-instrumentator==7.1.0
+nvidia-cusparse-cu12==12.1.0.106
+ruff==0.14.8
+mathruler==0.1.0
+pydantic_core==2.41.5
+pyairports==0.0.1
+ipython==9.8.0
+pynvml==13.0.1
+nvidia-cuda-nvrtc-cu12==12.1.105
+filelock==3.20.0
+loguru==0.7.3
+pandas==2.3.3
+msgspec==0.20.0
+uvicorn==0.38.0
+blobfile==3.0.0
+gitdb==4.0.12
+cachetools==6.2.2
+uv==0.9.17
+llguidance==0.7.30
+hf_transfer==0.1.9
+wcwidth==0.2.14
+aiohttp==3.13.2
+qwen-vl-utils==0.0.14
+rich-toolkit==0.17.0
+ptyprocess==0.7.0
+ipdb==0.13.13
+opencensus-context==0.1.3
+jedi==0.19.2
+soxr==1.0.0
+ray==2.52.1
+sgl-kernel==0.3.9.post2
+colorful==0.5.8
+pycparser==2.23
+charset-normalizer==3.4.4
+hf-xet==1.2.0
+dill==0.4.0
+tokenizers==0.22.1
+prometheus_client==0.23.1
+google-auth==2.43.0
+pydantic==2.12.5
+nvidia-ml-py==13.590.44
+fastapi-cloud-cli==0.6.0
+flashinfer-python==0.3.1
+orjson==3.11.5
+python-dateutil==2.9.0.post0
+GitPython==3.1.45
+triton==3.0.0
+torchao==0.9.0
+soundfile==0.13.1
+diskcache==5.6.3
+docstring_parser==0.17.0
+anyio==4.12.0
+matplotlib-inline==0.2.1
+Pygments==2.19.2
+pure_eval==0.2.3
+ninja==1.13.0
+outlines==0.0.46
+wandb==0.23.1
+regex==2025.11.3
+pyzmq==27.1.0
+iniconfig==2.3.0
+Jinja2==3.1.6
+wheel==0.45.1
+megatron-core==0.13.1
+multidict==6.7.0
+uvloop==0.22.1
+proto-plus==1.26.1
+pylatexenc==2.10
+decorator==5.2.1
+websockets==15.0.1
+shellingham==1.5.4
+lxml==6.0.2
+safetensors==0.7.0
+scipy==1.16.3
+xgrammar==0.1.25
+pybase64==1.4.3
+opentelemetry-semantic-conventions==0.60b0
+pydantic-extra-types==2.10.6
+rignore==0.7.6
+nvidia-cuda-runtime-cu12==12.1.105
+distlib==0.4.0
+executing==2.2.1
+grpcio==1.76.0
+pip==25.3
+verl==0.1
+autocommand==2.2.2
+typeguard==4.3.0
+more-itertools==10.3.0
+importlib_metadata==8.0.0
+jaraco.functools==4.0.1
+typing_extensions==4.12.2
+jaraco.text==3.12.1
+platformdirs==4.2.2
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+inflect==7.3.1
+tomli==2.0.1
+zipp==3.19.2
+backports.tarfile==1.2.0
+wheel==0.45.1
+packaging==24.2
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ab5bffd7a3a1fe7b8138b54a8900afd686bba50
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json
@@ -0,0 +1,93 @@
+{
+ "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35",
+ "python": "CPython 3.12.12",
+ "startedAt": "2026-02-02T01:24:56.695929Z",
+ "args": [
+ "--node-ip-address=172.16.34.29",
+ "--node-manager-port=35889",
+ "--object-store-name=/tmp/ray/session_2026-02-01_20-24-16_851383_1554534/sockets/plasma_store",
+ "--raylet-name=/tmp/ray/session_2026-02-01_20-24-16_851383_1554534/sockets/raylet",
+ "--redis-address=None",
+ "--metrics-agent-port=60951",
+ "--logging-rotate-bytes=536870912",
+ "--logging-rotate-backup-count=5",
+ "--runtime-env-agent-port=63006",
+ "--gcs-address=172.16.34.29:62587",
+ "--session-name=session_2026-02-01_20-24-16_851383_1554534",
+ "--temp-dir=/tmp/ray",
+ "--webui=127.0.0.1:8301",
+ "--cluster-id=2a6a92c2ce2c7497d2c570ee4ed306ca282172c9fb3a948d0d91f16a",
+ "--startup-token=128",
+ "--worker-launch-time-ms=1769995470827",
+ "--node-id=8646fde502441bfcc43d5303c4610d10bdcd65e2ec1b75c5626051ce",
+ "--runtime-env-hash=1830736042"
+ ],
+ "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py",
+ "git": {
+ "remote": "https://github.com/PeterGriffinJin/Search-R1",
+ "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3"
+ },
+ "email": "shahidulshakib034@gmail.com",
+ "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1",
+ "host": "gamma",
+ "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3",
+ "cpu_count": 64,
+ "cpu_count_logical": 128,
+ "gpu": "NVIDIA A100 80GB PCIe",
+ "gpu_count": 6,
+ "disk": {
+ "/": {
+ "total": "3766429188096",
+ "used": "184573485056"
+ }
+ },
+ "memory": {
+ "total": "1081814863872"
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece"
+ }
+ ],
+ "cudaVersion": "13.0",
+ "writerId": "vtydy8v1vqlfqdr1gmygsbjfgg784jf6"
+}
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..01035e4956719aba11e232284dad8bcd06699791
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log
@@ -0,0 +1,7 @@
+{"time":"2026-02-01T20:24:56.909555274-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpoqvy8trh/port-1562972.txt","pid":1562972,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2026-02-01T20:24:56.910726586-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1562972}
+{"time":"2026-02-01T20:24:56.91070461-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1562972-1568004-2806483583/socket","Net":"unix"}}
+{"time":"2026-02-01T20:24:57.077884409-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2026-02-01T20:24:57.090876423-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"lmw144t2","id":"1(@)"}
+{"time":"2026-02-01T20:24:58.917447274-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"lmw144t2","id":"1(@)"}
+{"time":"2026-02-01T20:25:47.319167155-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..1062a599f72886f79baca3122e696d65c44cecbd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log
@@ -0,0 +1,6 @@
+{"time":"2026-02-01T20:24:57.092208445-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+{"time":"2026-02-01T20:24:58.913739618-05:00","level":"INFO","msg":"stream: created new stream","id":"lmw144t2"}
+{"time":"2026-02-01T20:24:58.913897747-05:00","level":"INFO","msg":"handler: started","stream_id":"lmw144t2"}
+{"time":"2026-02-01T20:24:58.91742272-05:00","level":"INFO","msg":"stream: started","id":"lmw144t2"}
+{"time":"2026-02-01T20:24:58.917508674-05:00","level":"INFO","msg":"writer: started","stream_id":"lmw144t2"}
+{"time":"2026-02-01T20:24:58.91751844-05:00","level":"INFO","msg":"sender: started","stream_id":"lmw144t2"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..d18fa765e5d66f927fd81e9e5bf17d79c139c003
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log
@@ -0,0 +1,21 @@
+2026-02-01 20:24:56,717 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+2026-02-01 20:24:56,717 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Configure stats pid to 1562972
+2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings
+2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings
+2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log
+2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log
+2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():841] calling init triggers
+2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': True}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}}
+2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():889] starting backend
+2026-02-01 20:24:57,078 INFO MainThread:1562972 [wandb_init.py:init():892] sending inform_init request
+2026-02-01 20:24:57,086 INFO MainThread:1562972 [wandb_init.py:init():900] backend started and connected
+2026-02-01 20:24:57,092 INFO MainThread:1562972 [wandb_init.py:init():970] updated telemetry
+2026-02-01 20:24:57,114 INFO MainThread:1562972 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+2026-02-01 20:24:59,139 INFO MainThread:1562972 [wandb_init.py:init():1041] starting run threads in backend
+2026-02-01 20:24:59,972 INFO MainThread:1562972 [wandb_run.py:_console_start():2521] atexit reg
+2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2438] Wrapping output streams.
+2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2461] Redirects installed.
+2026-02-01 20:24:59,985 INFO MainThread:1562972 [wandb_init.py:init():1081] run started, returning control to user process
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32
Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb differ
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..b0055b3731d82dd99f43f1be90744402fc3ac222
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log
@@ -0,0 +1,53 @@
+wandb: Detected [openai] in use.
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_init_model() (pid=1584886, ip=172.16.34.29, actor_id=73dee5b169bd353a8f66401d01000000, repr=)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func
+ return getattr(self.worker_dict[key], name)(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 332, in init_model
+ self.rollout, self.rollout_sharding_manager = self._build_rollout()
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 268, in _build_rollout
+ rollout = vLLMRollout(actor_module=self.actor_module_fsdp,
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/vllm_rollout.py", line 91, in __init__
+ self.inference_engine = LLM(actor_module,
+ ^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm.py", line 142, in __init__
+ self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args) # TODO: check usagecontext
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py", line 382, in from_engine_args
+ engine_config = engine_args.create_engine_config()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py", line 71, in create_engine_config
+ engine_config = super().create_engine_config()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/engine/arg_utils.py", line 900, in create_engine_config
+ model_config = self.create_model_config()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py", line 34, in create_model_config
+ return ModelConfig(
+ ^^^^^^^^^^^^
+ File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/config.py", line 47, in __init__
+ super().__init__(model=hf_config._name_or_path, tokenizer=hf_config._name_or_path, *args, **kwargs)
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/config.py", line 194, in __init__
+ self.multimodal_config = self._init_multimodal_config(
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/config.py", line 213, in _init_multimodal_config
+ if ModelRegistry.is_multimodal_model(architectures):
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 384, in is_multimodal_model
+ return self.inspect_model_cls(architectures).supports_multimodal
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 353, in inspect_model_cls
+ return self._raise_for_unsupported(architectures)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 314, in _raise_for_unsupported
+ raise ValueError(
+ValueError: Model architectures ['Qwen3ForCausalLM'] are not supported for now. Supported architectures: ['AquilaModel', 'AquilaForCausalLM', 'ArcticForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BloomForCausalLM', 'CohereForCausalLM', 'DbrxForCausalLM', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'DeepseekV2ForCausalLM', 'ExaoneForCausalLM', 'FalconForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'JAISLMHeadModel', 'JambaForCausalLM', 'LlamaForCausalLM', 'LLaMAForCausalLM', 'MambaForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'QuantMixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 'MiniCPMForCausalLM', 'MiniCPM3ForCausalLM', 'NemotronForCausalLM', 'OlmoForCausalLM', 'OlmoeForCausalLM', 'OPTForCausalLM', 'OrionForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3SmallForCausalLM', 'PhiMoEForCausalLM', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RWForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'SolarForCausalLM', 'XverseForCausalLM', 'BartModel', 'BartForConditionalGeneration', 'MistralModel', 'Qwen2ForRewardModel', 'Gemma2Model', 'Blip2ForConditionalGeneration', 'ChameleonForConditionalGeneration', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'FuyuForCausalLM', 'InternVLChatModel', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'LlavaNextVideoForConditionalGeneration', 'LlavaOnevisionForConditionalGeneration', 'MiniCPMV', 'MolmoForCausalLM', 'NVLM_D', 'PaliGemmaForConditionalGeneration', 'Phi3VForCausalLM', 'PixtralForConditionalGeneration', 'QWenLMHeadModel', 'Qwen2VLForConditionalGeneration', 'UltravoxModel', 'MllamaForConditionalGeneration', 'EAGLEModel', 'MedusaModel', 'MLPSpeculatorPreTrainedModel']
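
The retried run (lly0j9zs) then fails one layer down: the pinned vllm==0.6.3 (see the captured requirements.txt) predates Qwen3, so its ModelRegistry has no Qwen3ForCausalLM entry and the rollout engine refuses to build. A pre-flight check along these lines would surface the mismatch before Ray spins up workers; this is a sketch against the 0.6.3-era API, where ModelRegistry.get_supported_archs() is the public accessor behind the error text above.

from transformers import AutoConfig
from vllm import ModelRegistry

# Compare the checkpoint's declared architectures with what this
# vLLM build can actually serve.
hf_config = AutoConfig.from_pretrained('Qwen/Qwen3-4B-Instruct-2507')
supported = set(ModelRegistry.get_supported_archs())
missing = [arch for arch in hf_config.architectures if arch not in supported]
if missing:
    raise SystemExit(f"vLLM cannot serve {missing}; upgrade vllm or pick a "
                     "Qwen2-generation checkpoint (Qwen2ForCausalLM is listed).")

Unlike the RMPad failure, there is no config flag to flip here: the architecture has to exist in vLLM's registry, so the options are upgrading the vllm pin or registering a compatible model class via ModelRegistry.register_model.
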
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2dd689031142d7d6d129a52ef2a92f2070eb5c06
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt
@@ -0,0 +1,288 @@
+verl==0.1
+psutil==7.1.3
+colorama==0.4.6
+annotated-doc==0.0.4
+pyasn1==0.6.1
+virtualenv==20.35.4
+requests==2.32.5
+nvidia-cufft-cu12==11.0.2.54
+nvidia-cufile-cu12==1.13.1.3
+verl==0.1
+ml_dtypes==0.5.4
+opentelemetry-sdk==1.39.0
+sglang==0.5.2
+xformers==0.0.27.post2
+lm-format-enforcer==0.10.6
+typing_extensions==4.15.0
+nvidia-cusparselt-cu12==0.7.1
+openai-harmony==0.0.4
+transformers==4.56.1
+pytest==9.0.2
+psutil==7.1.3
+cupy-cuda12x==13.6.0
+tqdm==4.67.1
+onnx==1.20.0
+pybind11==3.0.1
+partial-json-parser==0.2.1.1.post7
+nvidia-nccl-cu12==2.20.5
+aiohttp-cors==0.8.1
+sniffio==1.3.1
+tensordict==0.10.0
+smart_open==7.5.0
+cffi==2.0.0
+asttokens==3.0.1
+opencensus==0.11.4
+rpds-py==0.30.0
+py-spy==0.4.1
+nvidia-nvjitlink-cu12==12.8.93
+httpx==0.28.1
+cuda-python==13.1.1
+annotated-types==0.7.0
+idna==3.11
+fsspec==2025.10.0
+parso==0.8.5
+torchvision==0.19.0
+MarkupSafe==3.0.3
+opentelemetry-api==1.39.0
+pytz==2025.2
+dnspython==2.8.0
+zipp==3.23.0
+PyYAML==6.0.3
+onnx-ir==0.1.12
+torchdata==0.11.0
+Markdown==3.10
+urllib3==2.6.1
+cuda-pathfinder==1.3.3
+nvidia-cuda-cupti-cu12==12.1.105
+httptools==0.7.1
+pyarrow==22.0.0
+opentelemetry-proto==1.39.0
+certifi==2025.11.12
+typer==0.20.0
+python-json-logger==4.0.0
+pillow==12.0.0
+cuda-bindings==13.1.1
+Werkzeug==3.1.4
+mdurl==0.1.2
+vllm==0.6.3
+referencing==0.37.0
+xxhash==3.6.0
+interegular==0.3.3
+build==1.3.0
+fastapi-cli==0.0.16
+tensorboard==2.20.0
+sentencepiece==0.2.1
+flash_attn==2.8.3
+yarl==1.22.0
+opencv-fixer==0.2.5
+python-dotenv==1.2.1
+timm==1.0.16
+aiohappyeyeballs==2.6.1
+decord==0.6.0
+nvidia-cusolver-cu12==11.4.5.107
+jiter==0.12.0
+airportsdata==20250909
+nvidia-nvtx-cu12==12.1.105
+markdown-it-py==4.0.0
+torch==2.4.0
+thefuzz==0.22.1
+opencv-python-headless==4.11.0.86
+pycryptodomex==3.23.0
+pexpect==4.9.0
+distro==1.9.0
+cloudpickle==3.1.2
+mpmath==1.3.0
+antlr4-python3-runtime==4.9.3
+peft==0.18.0
+tzdata==2025.2
+accelerate==1.12.0
+watchfiles==1.1.1
+omegaconf==2.3.0
+multiprocess==0.70.18
+frozendict==2.4.7
+sympy==1.14.0
+setproctitle==1.3.7
+setuptools==79.0.1
+py-cpuinfo==9.0.0
+ipython_pygments_lexers==1.1.1
+openai==1.99.1
+outlines_core==0.2.11
+google-api-core==2.28.1
+llvmlite==0.44.0
+attrs==25.4.0
+packaging==25.0
+fastrlock==0.8.3
+astor==0.8.1
+gguf==0.10.0
+opencv-python==4.12.0.88
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cudnn-frontend==1.16.0
+pluggy==1.6.0
+compressed-tensors==0.11.0
+importlib_metadata==8.7.0
+RapidFuzz==3.14.3
+networkx==3.6.1
+httpcore==1.0.9
+pre_commit==4.5.0
+python-multipart==0.0.20
+rich==14.2.0
+onnxscript==0.3.1
+cbor2==5.7.1
+smmap==5.0.2
+numpy==1.26.4
+opentelemetry-exporter-prometheus==0.60b0
+click==8.2.1
+traitlets==5.14.3
+nvidia-curand-cu12==10.3.2.106
+pyvers==0.1.0
+huggingface-hub==0.36.0
+cfgv==3.5.0
+optree==0.18.0
+anthropic==0.75.0
+email-validator==2.3.0
+tabulate==0.9.0
+msgpack==1.1.2
+depyf==0.19.0
+numba==0.61.2
+six==1.17.0
+einops==0.8.1
+aiosignal==1.4.0
+propcache==0.4.1
+torch_memory_saver==0.0.8
+h11==0.16.0
+frozenlist==1.8.0
+pycountry==24.6.1
+modelscope==1.33.0
+sentry-sdk==2.47.0
+av==16.0.1
+stack-data==0.6.3
+typing-inspection==0.4.2
+googleapis-common-protos==1.72.0
+blake3==1.0.8
+nvidia-cudnn-cu12==9.1.0.70
+liger_kernel==0.6.4
+wrapt==2.0.1
+prompt_toolkit==3.0.52
+torchaudio==2.8.0
+identify==2.6.15
+mistral_common==1.8.6
+codetiming==1.4.0
+nodeenv==1.9.1
+platformdirs==4.5.1
+jsonschema-specifications==2025.9.1
+protobuf==6.33.2
+hydra-core==1.3.2
+absl-py==2.3.1
+tensorboard-data-server==0.7.2
+jsonschema==4.25.1
+pyasn1_modules==0.4.2
+tiktoken==0.12.0
+starlette==0.50.0
+pyproject_hooks==1.2.0
+fastapi==0.124.2
+rsa==4.9.1
+nest-asyncio==1.6.0
+lark==1.2.2
+fastar==0.8.0
+datasets==4.4.1
+prometheus-fastapi-instrumentator==7.1.0
+nvidia-cusparse-cu12==12.1.0.106
+ruff==0.14.8
+mathruler==0.1.0
+pydantic_core==2.41.5
+pyairports==0.0.1
+ipython==9.8.0
+pynvml==13.0.1
+nvidia-cuda-nvrtc-cu12==12.1.105
+filelock==3.20.0
+loguru==0.7.3
+pandas==2.3.3
+msgspec==0.20.0
+uvicorn==0.38.0
+blobfile==3.0.0
+gitdb==4.0.12
+cachetools==6.2.2
+uv==0.9.17
+llguidance==0.7.30
+hf_transfer==0.1.9
+wcwidth==0.2.14
+aiohttp==3.13.2
+qwen-vl-utils==0.0.14
+rich-toolkit==0.17.0
+ptyprocess==0.7.0
+ipdb==0.13.13
+opencensus-context==0.1.3
+jedi==0.19.2
+soxr==1.0.0
+ray==2.52.1
+sgl-kernel==0.3.9.post2
+colorful==0.5.8
+pycparser==2.23
+charset-normalizer==3.4.4
+hf-xet==1.2.0
+dill==0.4.0
+tokenizers==0.22.1
+prometheus_client==0.23.1
+google-auth==2.43.0
+pydantic==2.12.5
+nvidia-ml-py==13.590.44
+fastapi-cloud-cli==0.6.0
+flashinfer-python==0.3.1
+orjson==3.11.5
+python-dateutil==2.9.0.post0
+GitPython==3.1.45
+triton==3.0.0
+torchao==0.9.0
+soundfile==0.13.1
+diskcache==5.6.3
+docstring_parser==0.17.0
+anyio==4.12.0
+matplotlib-inline==0.2.1
+Pygments==2.19.2
+pure_eval==0.2.3
+ninja==1.13.0
+outlines==0.0.46
+wandb==0.23.1
+regex==2025.11.3
+pyzmq==27.1.0
+iniconfig==2.3.0
+Jinja2==3.1.6
+wheel==0.45.1
+megatron-core==0.13.1
+multidict==6.7.0
+uvloop==0.22.1
+proto-plus==1.26.1
+pylatexenc==2.10
+decorator==5.2.1
+websockets==15.0.1
+shellingham==1.5.4
+lxml==6.0.2
+safetensors==0.7.0
+scipy==1.16.3
+xgrammar==0.1.25
+pybase64==1.4.3
+opentelemetry-semantic-conventions==0.60b0
+pydantic-extra-types==2.10.6
+rignore==0.7.6
+nvidia-cuda-runtime-cu12==12.1.105
+distlib==0.4.0
+executing==2.2.1
+grpcio==1.76.0
+pip==25.3
+verl==0.1
+autocommand==2.2.2
+typeguard==4.3.0
+more-itertools==10.3.0
+importlib_metadata==8.0.0
+jaraco.functools==4.0.1
+typing_extensions==4.12.2
+jaraco.text==3.12.1
+platformdirs==4.2.2
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+inflect==7.3.1
+tomli==2.0.1
+zipp==3.19.2
+backports.tarfile==1.2.0
+wheel==0.45.1
+packaging==24.2
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cf15a6570ef487e4a89d88ebc3824111f0f9fdf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json
@@ -0,0 +1,93 @@
+{
+ "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35",
+ "python": "CPython 3.12.12",
+ "startedAt": "2026-02-02T01:27:25.847668Z",
+ "args": [
+ "--node-ip-address=172.16.34.29",
+ "--node-manager-port=42463",
+ "--object-store-name=/tmp/ray/session_2026-02-01_20-26-45_640792_1570462/sockets/plasma_store",
+ "--raylet-name=/tmp/ray/session_2026-02-01_20-26-45_640792_1570462/sockets/raylet",
+ "--redis-address=None",
+ "--metrics-agent-port=52627",
+ "--logging-rotate-bytes=536870912",
+ "--logging-rotate-backup-count=5",
+ "--runtime-env-agent-port=50340",
+ "--gcs-address=172.16.34.29:59784",
+ "--session-name=session_2026-02-01_20-26-45_640792_1570462",
+ "--temp-dir=/tmp/ray",
+ "--webui=127.0.0.1:8301",
+ "--cluster-id=aea41d36cae02be0d21d983bc0d205680cba8b3ba963a20925793ff7",
+ "--startup-token=128",
+ "--worker-launch-time-ms=1769995620433",
+ "--node-id=43683b8f20fc380a586055e760eed6ea68a97aefc99ec5eb0e1aaf3e",
+ "--runtime-env-hash=1830736042"
+ ],
+ "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py",
+ "git": {
+ "remote": "https://github.com/PeterGriffinJin/Search-R1",
+ "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3"
+ },
+ "email": "shahidulshakib034@gmail.com",
+ "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1",
+ "host": "gamma",
+ "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3",
+ "cpu_count": 64,
+ "cpu_count_logical": 128,
+ "gpu": "NVIDIA A100 80GB PCIe",
+ "gpu_count": 6,
+ "disk": {
+ "/": {
+ "total": "3766429188096",
+ "used": "184577196032"
+ }
+ },
+ "memory": {
+ "total": "1081814863872"
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538"
+ },
+ {
+ "name": "NVIDIA A100 80GB PCIe",
+ "memoryTotal": "85899345920",
+ "cudaCores": 6912,
+ "architecture": "Ampere",
+ "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb"
+ },
+ {
+ "name": "NVIDIA H100 PCIe",
+ "memoryTotal": "85520809984",
+ "cudaCores": 14592,
+ "architecture": "Hopper",
+ "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece"
+ }
+ ],
+ "cudaVersion": "13.0",
+ "writerId": "6aeyut6ybrvvbk4fszgmutrk1al0827k"
+}
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..999c4ddf8a791081b9d41c2e39eb6bf37be29d3e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log
@@ -0,0 +1,7 @@
+{"time":"2026-02-01T20:27:26.085930508-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpf6067hxz/port-1578907.txt","pid":1578907,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2026-02-01T20:27:26.089468105-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1578907}
+{"time":"2026-02-01T20:27:26.08946884-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1578907-1584383-2031627458/socket","Net":"unix"}}
+{"time":"2026-02-01T20:27:26.251033877-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+{"time":"2026-02-01T20:27:26.267228916-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"lly0j9zs","id":"1(@)"}
+{"time":"2026-02-01T20:27:27.695521129-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"lly0j9zs","id":"1(@)"}
+{"time":"2026-02-01T20:28:55.044327673-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..303163b4a3ed9a27addcc89f564458d66d92cea4
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log
@@ -0,0 +1,6 @@
+{"time":"2026-02-01T20:27:26.269116545-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+{"time":"2026-02-01T20:27:27.692526697-05:00","level":"INFO","msg":"stream: created new stream","id":"lly0j9zs"}
+{"time":"2026-02-01T20:27:27.692680073-05:00","level":"INFO","msg":"handler: started","stream_id":"lly0j9zs"}
+{"time":"2026-02-01T20:27:27.695494454-05:00","level":"INFO","msg":"stream: started","id":"lly0j9zs"}
+{"time":"2026-02-01T20:27:27.69557747-05:00","level":"INFO","msg":"writer: started","stream_id":"lly0j9zs"}
+{"time":"2026-02-01T20:27:27.695701035-05:00","level":"INFO","msg":"sender: started","stream_id":"lly0j9zs"}
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..8df0f98e5da5d448c64dbafc1ef3703811880cd5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log
@@ -0,0 +1,21 @@
+2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Configure stats pid to 1578907
+2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings
+2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings
+2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log
+2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log
+2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():841] calling init triggers
+2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': False}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}}
+2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():889] starting backend
+2026-02-01 20:27:26,251 INFO MainThread:1578907 [wandb_init.py:init():892] sending inform_init request
+2026-02-01 20:27:26,261 INFO MainThread:1578907 [wandb_init.py:init():900] backend started and connected
+2026-02-01 20:27:26,270 INFO MainThread:1578907 [wandb_init.py:init():970] updated telemetry
+2026-02-01 20:27:26,293 INFO MainThread:1578907 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+2026-02-01 20:27:27,908 INFO MainThread:1578907 [wandb_init.py:init():1041] starting run threads in backend
+2026-02-01 20:27:28,715 INFO MainThread:1578907 [wandb_run.py:_console_start():2521] atexit reg
+2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2438] Wrapping output streams.
+2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2461] Redirects installed.
+2026-02-01 20:27:28,726 INFO MainThread:1578907 [wandb_init.py:init():1081] run started, returning control to user process
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32
Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb differ
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 32
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.6
+ ignore_eos: false
+ enforce_eager: false
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 20
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 5
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
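Nearly every cross-file default in the config above goes through OmegaConf's built-in ${oc.select:path,default} resolver (OmegaConf >= 2.1): it yields the value at `path` when that node exists and the default otherwise. A minimal sketch of both branches:

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        "actor_rollout_ref": {
            "rollout": {"n": 3},
            "actor": {"rollout_n": "${oc.select:actor_rollout_ref.rollout.n,1}"},
        }
    })
    print(cfg.actor_rollout_ref.actor.rollout_n)   # 3 -- resolved from rollout.n

    missing = OmegaConf.create({"x": "${oc.select:some.absent.key,1}"})
    print(missing.x)                               # 1 -- falls back to the default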
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..de72ae678462cbdd9eff945fc9c5cf1e363eb8af
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml
@@ -0,0 +1,211 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=20
+ - trainer.test_freq=5
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
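Everything needed to rebuild this composition offline is recorded above: config_name ppo_trainer, the config source directory, and the task overrides. A sketch using Hydra's Compose API (the recorded runtime is Hydra 1.3.2, which accepts version_base), with only two of the overrides shown:

    from hydra import compose, initialize_config_dir

    config_dir = ("/data/home_beta/mshahidul/readctrl/code/RL_model/verl/"
                  "verl_train/verl/trainer/config")   # "main" config source above
    with initialize_config_dir(config_dir=config_dir, version_base="1.3"):
        cfg = compose(
            config_name="ppo_trainer",
            overrides=["algorithm.adv_estimator=grpo", "trainer.total_epochs=15"],
        )
    print(cfg.algorithm.adv_estimator)  # grpo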
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml
@@ -0,0 +1,44 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=False
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.6
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=False
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=20
+- trainer.test_freq=5
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+- trainer.total_epochs=15
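overrides.yaml is the argv Hydra received, stored as a flat YAML list of "key=value" strings, so a run can be relaunched by replaying the list through the entry point. A sketch; the module path is an assumption based on the job name main_ppo recorded above, so substitute whatever launcher the repo actually uses:

    import subprocess
    import yaml

    with open(".hydra/overrides.yaml") as f:
        overrides = yaml.safe_load(f)   # e.g. ["algorithm.adv_estimator=grpo", ...]

    # assumed entry point -- verify against the repo's launch script
    subprocess.run(["python3", "-m", "verl.trainer.main_ppo", *overrides], check=True)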
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..857e0a79019b5711eb7377126a063d42afed23fb
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 32
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.8
+ ignore_eos: false
+ enforce_eager: false
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 20
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 5
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
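Per the recorded overrides, this 20-37-47 snapshot differs from the 18-55-54 one only in actor_rollout_ref.rollout.gpu_memory_utilization (0.6 -> 0.8). For snapshots this size, a flatten-and-diff pass beats eyeballing; a sketch over the two files in the output tree above:

    from omegaconf import OmegaConf

    def flatten(node, prefix=""):
        out = {}
        for key, val in node.items():
            path = f"{prefix}{key}"
            if isinstance(val, dict):
                out.update(flatten(val, path + "."))   # recurse into mappings
            else:
                out[path] = val                        # lists/scalars are leaves
        return out

    def load_flat(path):
        # resolve=False keeps ${...} interpolations as literal strings
        return flatten(OmegaConf.to_container(OmegaConf.load(path), resolve=False))

    a = load_flat("outputs/2026-02-06/18-55-54/.hydra/config.yaml")
    b = load_flat("outputs/2026-02-06/20-37-47/.hydra/config.yaml")
    for path in sorted(set(a) | set(b)):
        if a.get(path) != b.get(path):
            print(path, a.get(path), "->", b.get(path))
    # expected: actor_rollout_ref.rollout.gpu_memory_utilization 0.6 -> 0.8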
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e8abfafb0f6bb0261095aa43a6e040c407d8f111
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml
@@ -0,0 +1,211 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.8
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=20
+ - trainer.test_freq=5
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.8,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
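Both runs pin algorithm.adv_estimator=grpo with actor_rollout_ref.rollout.n=3 and norm_adv_by_std_in_grpo=true: each prompt's three rollouts form a group and their scalar rewards are normalized within it. A sketch of the textbook group-relative advantage (my rendering of the standard formula, not verl's implementation):

    import numpy as np

    def grpo_advantages(rewards: np.ndarray, eps: float = 1e-6) -> np.ndarray:
        """rewards: (num_prompts, n) -- one row of n rollout rewards per prompt."""
        mean = rewards.mean(axis=1, keepdims=True)
        std = rewards.std(axis=1, keepdims=True)
        return (rewards - mean) / (std + eps)   # std division = norm_adv_by_std_in_grpo

    print(grpo_advantages(np.array([[1.0, 0.0, 0.5]])))
    # -> [[ 1.2247 -1.2247  0.    ]] (approximately)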
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1ea253ab12d257896733b5b02335994363d0ff7e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml
@@ -0,0 +1,44 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=False
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.8
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=False
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=20
+- trainer.test_freq=5
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a0c2bc7bf17311667e76f7481048c869e0814be5
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 32
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.85
+ ignore_eos: false
+ enforce_eager: false
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 20
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 5
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..92b5144edb482d95b09b7fafae1228062bd4358b
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml
@@ -0,0 +1,211 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.85
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=20
+ - trainer.test_freq=5
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.85,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2890e9a59c960ee8661bdbdd6ed0b91b44ed0d12
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml
@@ -0,0 +1,44 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=False
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.85
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=False
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=20
+- trainer.test_freq=5
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1325dcb8376c0482fa06f7fa92cd5021b0e4aa01
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml
@@ -0,0 +1,211 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=False
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=False
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=20
+ - trainer.test_freq=5
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..760d4957b3736a567e9cdd8914ee58513bd7aca6
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml
@@ -0,0 +1,212 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.rollout.enforce_eager=True
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-en
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=5
+ - trainer.test_freq=10
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..41a5c80023d58fa230bd12905b1088c3c96f960d
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml
@@ -0,0 +1,212 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.rollout.enforce_eager=True
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-en
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=5
+ - trainer.test_freq=10
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
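+# Editor's note: because algorithm.adv_estimator is grpo (see below), the trainer is
+# expected to run critic-free; this critic block keeps framework defaults (including the
+# placeholder ~/models/deepseek-llm-7b-chat path) and should not be instantiated. That
+# reading of verl's behavior is an assumption, not something the config itself states.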
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
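+# Editor's note (assumed semantics): with use_kl_in_reward: false the kl_ctrl block is
+# inert here; KL regularization instead enters through the actor loss, per the
+# command-line overrides recorded for this run:
+#   actor_rollout_ref.actor.use_kl_loss=True \
+#   actor_rollout_ref.actor.kl_loss_coef=0.001 \
+#   actor_rollout_ref.actor.kl_loss_type=low_var_kl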
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
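+# Editor's note on checkpoint retention as configured above: a checkpoint is written
+# every save_freq=5 steps, and max_actor_ckpt_to_keep=1 combined with
+# remove_previous_ckpt_in_save=true (appended via the +trainer.... override) keeps only
+# the most recent one under default_local_dir.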
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..08f9c1747d2eba6bdf69386139b1a462af7ac88f
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml
@@ -0,0 +1,212 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.rollout.enforce_eager=True
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-en
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=5
+ - trainer.test_freq=10
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ - trainer.total_epochs=15
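+ # Editor's note: this task list is the run's full override set. The leading '+' in
+ # '+trainer.remove_previous_ckpt_in_save=true' is Hydra syntax for appending a key
+ # absent from the base ppo_trainer config, while the other entries override
+ # existing keys.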
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
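+# Editor's note -- the KL term selected above, assuming verl's low_var_kl matches
+# Schulman's k3 estimator: per token, kl = exp(logp_ref - logp) - (logp_ref - logp) - 1,
+# added to the policy loss with weight kl_loss_coef=0.001; entropy_coeff: 0 disables
+# the entropy bonus entirely.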
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
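+# Editor's note: training rollouts sample at temperature 1.0 with n: 3 responses per
+# prompt (the GRPO group), while validation under val_kwargs decodes a single greedy
+# response (do_sample: false, temperature: 0, n: 1).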
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
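+# Editor's note (assumed hybrid-engine behavior): hybrid_engine: true co-locates FSDP
+# training and the vLLM rollout engine on the same GPUs, which is consistent with the
+# actor setting param_offload/optimizer_offload: true while the rollout engine caps
+# itself at gpu_memory_utilization: 0.4.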
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
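+# Editor's note -- batch arithmetic implied by this run, assuming verl scales
+# mini-batches by rollout.n: 512 prompts x n=3 rollouts = 1536 responses per step;
+# ppo_mini_batch_size: 256 (in prompts) then gives 2 optimizer updates per PPO epoch,
+# each accumulated from micro-batches of 16 sequences per GPU across n_gpus_per_node=2.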
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
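
The composed config above routes reward computation to an external `compute_score` function via `custom_reward_function.path` and `custom_reward_function.name`. The reward.py module itself is not part of this diff; below is a hypothetical sketch of the shape such an entry point takes (the keyword names follow verl's naive reward manager convention and may differ by version):

```python
# Hypothetical sketch of the compute_score entry point referenced by
# custom_reward_function; the real reward.py is not shown in this diff.
def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """Return a scalar reward for one sampled response."""
    # Toy exact-match rule; the actual subclaim-classifier reward is
    # presumably more involved.
    return 1.0 if solution_str.strip() == str(ground_truth).strip() else 0.0
```
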
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4278d13794d03e5c28a182f64003e9597bceda16
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml
@@ -0,0 +1,212 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.rollout.enforce_eager=True
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-en
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=5
+ - trainer.test_freq=10
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
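
Hydra snapshots every run under `.hydra/`: config.yaml holds the fully composed config, hydra.yaml the framework state, and overrides.yaml the exact CLI arguments, which makes each run inspectable and replayable after the fact. A minimal sketch of reading one of the run directories in this diff (assuming omegaconf is installed):

```python
from omegaconf import OmegaConf

run_dir = "outputs/2026-02-11/18-29-53"
cfg = OmegaConf.load(f"{run_dir}/.hydra/config.yaml")
overrides = OmegaConf.load(f"{run_dir}/.hydra/overrides.yaml")

print(cfg.trainer.project_name)          # readctrl-verl
print(cfg.actor_rollout_ref.model.path)  # Qwen/Qwen3-4B-Instruct-2507
print(list(overrides)[:3])               # first few replayable CLI overrides
```
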
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
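
Read together, the overrides fix the batch geometry of the run. A back-of-envelope sketch follows; note the assumption that these sizes are counted in prompts (whether verl counts prompts or sampled responses varies across versions, so treat the numbers as indicative):

```python
# Batch geometry implied by the overrides above (indicative only).
train_batch_size = 512   # data.train_batch_size: prompts per RL step
rollout_n = 3            # actor_rollout_ref.rollout.n: GRPO group size
mini_batch = 256         # actor_rollout_ref.actor.ppo_mini_batch_size

responses_per_step = train_batch_size * rollout_n  # 1536 sequences sampled and scored
updates_per_step = train_batch_size // mini_batch  # 2 optimizer passes per PPO epoch
print(responses_per_step, updates_per_step)
```
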
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml
@@ -0,0 +1,649 @@
+actor_rollout_ref:
+ actor:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-06
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: true
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: 16
+ use_dynamic_bsz: false
+ ppo_max_token_len_per_gpu: 16384
+ clip_ratio: 0.2
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.2
+ tau_pos: 1.0
+ tau_neg: 1.05
+ freeze_vision_tower: false
+ policy_loss:
+ _target_: verl.workers.config.PolicyLossConfig
+ loss_mode: vanilla
+ clip_cov_ratio: 0.0002
+ clip_cov_lb: 1.0
+ clip_cov_ub: 5.0
+ kl_cov_ratio: 0.0002
+ ppo_kl_coef: 0.1
+ clip_ratio_c: 3.0
+ loss_agg_mode: token-mean
+ loss_scale_factor: null
+ entropy_coeff: 0
+ calculate_entropy: false
+ use_kl_loss: true
+ use_prefix_grouper: false
+ use_torch_compile: true
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ ppo_epochs: 1
+ shuffle: false
+ data_loader_seed: 42
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ grad_clip: 1.0
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
+ calculate_sum_pi_squared: false
+ sum_pi_squared_checkpointing: false
+ ref:
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: ${actor_rollout_ref.actor.strategy}
+ use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ router_replay:
+ _target_: verl.workers.config.RouterReplayConfig
+ mode: disabled
+ record_file: null
+ replay_file: null
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: true
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: true
+ strategy: fsdp
+ dtype: bfloat16
+ _target_: verl.workers.config.FSDPActorConfig
+ ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
+ entropy_from_logits_with_chunking: false
+ entropy_checkpointing: false
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: vllm
+ mode: async
+ temperature: 1.0
+ top_k: -1
+ top_p: 1
+ prompt_length: ${oc.select:data.max_prompt_length,512}
+ response_length: ${oc.select:data.max_response_length,512}
+ dtype: bfloat16
+ gpu_memory_utilization: 0.4
+ ignore_eos: false
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ tensor_model_parallel_size: 1
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ pipeline_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_model_len: 8192
+ max_num_seqs: 1024
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ logprobs_mode: processed_logprobs
+ scheduling_policy: fcfs
+ load_format: dummy
+ log_prob_micro_batch_size: null
+ log_prob_micro_batch_size_per_gpu: 32
+ log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
+ disable_log_stats: true
+ do_sample: true
+ 'n': 3
+ over_sample_rate: 0
+ multi_stage_wake_up: false
+ engine_kwargs:
+ vllm: {}
+ sglang: {}
+ trtllm: {}
+ val_kwargs:
+ _target_: verl.workers.config.SamplingConfig
+ top_k: -1
+ top_p: 1.0
+ temperature: 0
+ 'n': 1
+ do_sample: false
+ multi_turn:
+ _target_: verl.workers.config.MultiTurnConfig
+ enable: false
+ max_assistant_turns: null
+ tool_config_path: null
+ max_user_turns: null
+ max_parallel_calls: 1
+ max_tool_response_length: 256
+ tool_response_truncate_side: middle
+ interaction_config_path: null
+ use_inference_chat_template: false
+ tokenization_sanity_check_mode: strict
+ format: hermes
+ num_repeat_rollouts: null
+ calculate_log_probs: false
+ agent:
+ _target_: verl.workers.config.AgentLoopConfig
+ num_workers: 8
+ default_agent_loop: single_turn_agent
+ agent_loop_config_path: null
+ custom_async_server:
+ _target_: verl.workers.config.CustomAsyncServerConfig
+ path: null
+ name: null
+ checkpoint_engine:
+ _target_: verl.workers.config.CheckpointEngineConfig
+ backend: naive
+ update_weights_bucket_megabytes: 2048
+ engine_kwargs: {}
+ trace:
+ _target_: verl.workers.config.TraceConfig
+ backend: null
+ token2text: false
+ max_samples_per_step_per_worker: null
+ skip_rollout: false
+ skip_dump_dir: /tmp/rollout_dump
+ skip_tokenizer_init: true
+ enable_rollout_routing_replay: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}
+ all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}
+ ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ prometheus:
+ _target_: verl.workers.config.PrometheusConfig
+ enable: false
+ port: 9090
+ file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml
+ served_model_name: ${oc.select:actor_rollout_ref.model.path,null}
+ quantization: null
+ quantization_config_file: null
+ mtp: ${oc.select:actor_rollout_ref.model.mtp, null}
+ layered_summon: false
+ model:
+ _target_: verl.workers.config.HFModelConfig
+ path: Qwen/Qwen3-4B-Instruct-2507
+ hf_config_path: null
+ tokenizer_path: null
+ use_shm: false
+ trust_remote_code: false
+ custom_chat_template: null
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: true
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ exclude_modules: null
+ lora_adapter_path: null
+ use_liger: false
+ use_fused_kernels: false
+ fused_kernel_options:
+ impl_backend: torch
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ mtp:
+ _target_: verl.workers.config.MtpConfig
+ enable: false
+ enable_train: false
+ enable_rollout: false
+ detach_encoder: false
+ mtp_loss_scaling_factor: 0.1
+ speculative_algorithm: EAGLE
+ speculative_num_steps: 3
+ speculative_eagle_topk: 1
+ speculative_num_draft_tokens: 4
+ method: mtp
+ num_speculative_tokens: 1
+ hybrid_engine: true
+ nccl_timeout: 600
+data:
+ tokenizer: null
+ use_shm: false
+ train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ train_max_samples: -1
+ val_max_samples: -1
+ prompt_key: prompt
+ reward_fn_key: data_source
+ max_prompt_length: 1024
+ max_response_length: 2048
+ train_batch_size: 512
+ val_batch_size: null
+ tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path,
+ null}
+ return_raw_input_ids: false
+ return_raw_chat: true
+ return_full_prompt: false
+ shuffle: true
+ seed: null
+ dataloader_num_workers: 8
+ image_patch_size: 14
+ validation_shuffle: false
+ filter_overlong_prompts: true
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ video_key: videos
+ trust_remote_code: false
+ custom_cls:
+ path: null
+ name: null
+ return_multi_modal_inputs: true
+ sampler:
+ class_path: null
+ class_name: null
+ datagen:
+ path: null
+ name: null
+ apply_chat_template_kwargs: {}
+reward_manager:
+ _target_: verl.trainer.config.config.RewardManagerConfig
+ source: register
+ name: ${oc.select:reward_model.reward_manager,naive}
+ module:
+ _target_: verl.trainer.config.config.ModuleConfig
+ path: null
+ name: custom_reward_manager
+critic:
+ optim:
+ _target_: verl.workers.config.FSDPOptimizerConfig
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1.0e-05
+ lr_warmup_steps_ratio: 0.0
+ total_training_steps: -1
+ weight_decay: 0.01
+ lr_warmup_steps: -1
+ betas:
+ - 0.9
+ - 0.999
+ clip_grad: 1.0
+ min_lr_ratio: 0.0
+ num_cycles: 0.5
+ lr_scheduler_type: constant
+ warmup_style: null
+ override_optimizer_config: null
+ model:
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ optimizer_offload: false
+ offload_policy: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ model_dtype: fp32
+ use_orig_params: false
+ seed: 42
+ full_determinism: false
+ ulysses_sequence_parallel_size: 1
+ entropy_from_logits_with_chunking: false
+ use_torch_compile: true
+ entropy_checkpointing: false
+ forward_only: false
+ strategy: fsdp
+ dtype: bfloat16
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
+ override_config: {}
+ external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
+ trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
+ _target_: verl.workers.config.FSDPCriticModelCfg
+ use_shm: false
+ enable_gradient_checkpointing: true
+ enable_activation_offload: false
+ use_remove_padding: false
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ tiled_mlp:
+ enabled: false
+ num_shards: 4
+ _target_: verl.workers.config.FSDPCriticConfig
+ rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
+ strategy: fsdp
+ enable: null
+ ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
+ ppo_micro_batch_size: null
+ ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
+ ppo_max_token_len_per_gpu: 32768
+ forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
+ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
+ shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
+ data_loader_seed: 42
+ cliprange_value: 0.5
+ loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
+ checkpoint:
+ _target_: verl.trainer.config.CheckpointConfig
+ save_contents:
+ - model
+ - optimizer
+ - extra
+ load_contents: ${.save_contents}
+ async_save: false
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
+ npu:
+ _target_: verl.utils.profiler.config.NPUToolConfig
+ contents: []
+ level: level0
+ analysis: true
+ discrete: false
+ torch:
+ _target_: verl.utils.profiler.config.TorchProfilerToolConfig
+ contents: []
+ discrete: false
+ torch_memory:
+ _target_: verl.utils.profiler.config.TorchMemoryToolConfig
+ trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
+ stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
+ forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
+ forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
+ ulysses_sequence_parallel_size: 1
+ grad_clip: 1.0
+reward_model:
+ enable: false
+ enable_resource_pool: false
+ n_gpus_per_node: 8
+ nnodes: 0
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path}
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: false
+ override_config: {}
+ use_shm: false
+ use_remove_padding: false
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+ fsdp_config:
+ _target_: verl.workers.config.FSDPEngineConfig
+ wrap_policy:
+ min_num_params: 0
+ param_offload: false
+ reshard_after_forward: true
+ fsdp_size: -1
+ forward_prefetch: false
+ micro_batch_size: null
+ micro_batch_size_per_gpu: null
+ max_length: null
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ reward_manager: naive
+ reward_loop_source: register
+ reward_loop_module_path: null
+ reward_loop_class_name: null
+ launch_reward_fn_async: false
+ sandbox_fusion:
+ url: null
+ max_concurrent: 64
+ memory_limit_mb: 1024
+ profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: ${oc.select:global_profiler.tool,null}
+ enable: false
+ all_ranks: false
+ ranks: []
+ save_path: ${oc.select:global_profiler.save_path,null}
+ tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
+ ulysses_sequence_parallel_size: 1
+ use_reward_loop: true
+ num_workers: 1
+ rollout:
+ _target_: verl.workers.config.RolloutConfig
+ name: ???
+ dtype: bfloat16
+ gpu_memory_utilization: 0.5
+ enforce_eager: true
+ cudagraph_capture_sizes: null
+ free_cache_engine: true
+ data_parallel_size: 1
+ expert_parallel_size: 1
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_model_len: null
+ max_num_seqs: 1024
+ load_format: auto
+ engine_kwargs: {}
+ limit_images: null
+ enable_chunked_prefill: true
+ enable_prefix_caching: true
+ disable_log_stats: true
+ skip_tokenizer_init: false
+ prompt_length: 2048
+ response_length: 2048
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ rollout_rs_threshold: null
+ bypass_mode: false
+ loss_type: ppo_clip
+ rollout_is_batch_normalize: false
+ _target_: verl.trainer.config.AlgoConfig
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ norm_adv_by_std_in_grpo: true
+ use_kl_in_reward: false
+ kl_penalty: kl
+ kl_ctrl:
+ _target_: verl.trainer.config.KLControlConfig
+ type: fixed
+ kl_coef: 0.001
+ horizon: 10000
+ target_kl: 0.1
+ use_pf_ppo: false
+ pf_ppo:
+ reweight_method: pow
+ weight_pow: 2.0
+custom_reward_function:
+ path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ name: compute_score
+trainer:
+ balance_batch: true
+ total_epochs: 15
+ total_training_steps: null
+ project_name: readctrl-verl
+ experiment_name: qwen3-4b-instruct-en
+ logger:
+ - console
+ - wandb
+ log_val_generations: 0
+ rollout_data_dir: null
+ validation_data_dir: null
+ nnodes: 1
+ n_gpus_per_node: 2
+ save_freq: 5
+ esi_redundant_time: 0
+ resume_mode: auto
+ resume_from_path: null
+ val_before_train: true
+ val_only: false
+ test_freq: 10
+ critic_warmup: 0
+ default_hdfs_dir: null
+ del_local_ckpt_after_load: false
+ default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ max_actor_ckpt_to_keep: 1
+ max_critic_ckpt_to_keep: 1
+ ray_wait_register_center_timeout: 300
+ device: cuda
+ use_legacy_worker_impl: auto
+ remove_previous_ckpt_in_save: true
+global_profiler:
+ _target_: verl.utils.profiler.ProfilerConfig
+ tool: null
+ steps: null
+ profile_continuous_steps: false
+ save_path: outputs/profile
+ global_tool_config:
+ nsys:
+ _target_: verl.utils.profiler.config.NsightToolConfig
+ discrete: false
+ controller_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ worker_nsight_options:
+ trace: cuda,nvtx,cublas,ucx
+ cuda-memory-usage: 'true'
+ cuda-graph-trace: graph
+ capture-range: cudaProfilerApi
+ capture-range-end: null
+ kill: none
+ torch_memory:
+ trace_alloc_max_entries: 100000
+ stack_depth: 32
+ context: all
+ stacks: all
+ kw_args: {}
+transfer_queue:
+ enable: false
+ray_kwargs:
+ ray_init:
+ num_cpus: null
+ timeline_json_file: null
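
This composed config resolves most cross-references through OmegaConf's `oc.select` resolver, which falls back to a default when the referenced key is missing instead of raising an error. A self-contained demo of that behavior (assuming omegaconf >= 2.1):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "rollout": {"n": 3},
    "rollout_n": "${oc.select:rollout.n,1}",  # key present -> 3
    "fallback": "${oc.select:rollout.m,1}",   # key absent  -> default 1
})
print(cfg.rollout_n, cfg.fallback)  # 3 1
```
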
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7fb5e8a3f8e7fcd48a17a4988f7055a143adf972
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml
@@ -0,0 +1,212 @@
+hydra:
+ run:
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ sweep:
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+ subdir: ${hydra.job.num}
+ launcher:
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+ sweeper:
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+ max_batch_size: null
+ params: null
+ help:
+ app_name: ${hydra.job.name}
+ header: '${hydra.help.app_name} is powered by Hydra.
+
+ '
+ footer: 'Powered by Hydra (https://hydra.cc)
+
+ Use --hydra-help to view Hydra specific help
+
+ '
+ template: '${hydra.help.header}
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (group=option)
+
+
+ $APP_CONFIG_GROUPS
+
+
+ == Config ==
+
+ Override anything in the config (foo.bar=value)
+
+
+ $CONFIG
+
+
+ ${hydra.help.footer}
+
+ '
+ hydra_help:
+ template: 'Hydra (${hydra.runtime.version})
+
+ See https://hydra.cc for more info.
+
+
+ == Flags ==
+
+ $FLAGS_HELP
+
+
+ == Configuration groups ==
+
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+ to command line)
+
+
+ $HYDRA_CONFIG_GROUPS
+
+
+ Use ''--cfg hydra'' to Show the Hydra config.
+
+ '
+ hydra_help: ???
+ hydra_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][HYDRA] %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ root:
+ level: INFO
+ handlers:
+ - console
+ loggers:
+ logging_example:
+ level: DEBUG
+ disable_existing_loggers: false
+ job_logging:
+ version: 1
+ formatters:
+ simple:
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+ handlers:
+ console:
+ class: logging.StreamHandler
+ formatter: simple
+ stream: ext://sys.stdout
+ file:
+ class: logging.FileHandler
+ formatter: simple
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+ root:
+ level: INFO
+ handlers:
+ - console
+ - file
+ disable_existing_loggers: false
+ env: {}
+ mode: RUN
+ searchpath: []
+ callbacks: {}
+ output_subdir: .hydra
+ overrides:
+ hydra:
+ - hydra.mode=RUN
+ task:
+ - algorithm.adv_estimator=grpo
+ - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+ - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+ - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+ - data.train_batch_size=512
+ - data.max_prompt_length=1024
+ - data.max_response_length=2048
+ - data.filter_overlong_prompts=True
+ - data.truncation=error
+ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+ - actor_rollout_ref.actor.optim.lr=1e-6
+ - actor_rollout_ref.model.use_remove_padding=True
+ - actor_rollout_ref.actor.ppo_mini_batch_size=256
+ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+ - actor_rollout_ref.actor.use_kl_loss=True
+ - actor_rollout_ref.actor.kl_loss_coef=0.001
+ - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ - actor_rollout_ref.actor.entropy_coeff=0
+ - actor_rollout_ref.model.enable_gradient_checkpointing=True
+ - actor_rollout_ref.actor.fsdp_config.param_offload=True
+ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+ - actor_rollout_ref.rollout.name=vllm
+ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+ - actor_rollout_ref.rollout.enforce_eager=True
+ - actor_rollout_ref.rollout.max_model_len=8192
+ - actor_rollout_ref.rollout.n=3
+ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+ - actor_rollout_ref.ref.fsdp_config.param_offload=True
+ - algorithm.use_kl_in_reward=False
+ - trainer.critic_warmup=0
+ - trainer.logger=["console","wandb"]
+ - trainer.project_name=readctrl-verl
+ - trainer.experiment_name=qwen3-4b-instruct-en
+ - trainer.n_gpus_per_node=2
+ - trainer.nnodes=1
+ - trainer.save_freq=5
+ - trainer.test_freq=10
+ - +trainer.remove_previous_ckpt_in_save=true
+ - trainer.max_actor_ckpt_to_keep=1
+ - trainer.max_critic_ckpt_to_keep=1
+ - trainer.resume_mode=auto
+ - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+ - trainer.total_epochs=15
+ job:
+ name: main_ppo
+ chdir: null
+ override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+ id: ???
+ num: ???
+ config_name: ppo_trainer
+ env_set: {}
+ env_copy: []
+ config:
+ override_dirname:
+ kv_sep: '='
+ item_sep: ','
+ exclude_keys: []
+ runtime:
+ version: 1.3.2
+ version_base: '1.3'
+ cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+ config_sources:
+ - path: hydra.conf
+ schema: pkg
+ provider: hydra
+ - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+ schema: file
+ provider: main
+ - path: ''
+ schema: structured
+ provider: schema
+ output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56
+ choices:
+ algorithm@algorithm.rollout_correction: rollout_correction
+ reward_model: dp_reward_loop
+ critic: dp_critic
+ critic/../engine@critic.model.fsdp_config: fsdp
+ critic/../optim@critic.optim: fsdp
+ model@actor_rollout_ref.model: hf_model
+ rollout@actor_rollout_ref.rollout: rollout
+ ref@actor_rollout_ref.ref: dp_ref
+ ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+ data: legacy_data
+ actor@actor_rollout_ref.actor: dp_actor
+ actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+ actor/../optim@actor_rollout_ref.actor.optim: fsdp
+ hydra/env: default
+ hydra/callbacks: null
+ hydra/job_logging: default
+ hydra/hydra_logging: default
+ hydra/hydra_help: default
+ hydra/help: default
+ hydra/sweeper: basic
+ hydra/launcher: basic
+ hydra/output: default
+ verbose: false
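
Both runs set kl_loss_type=low_var_kl, i.e. the low-variance k3 KL estimator, exp(x) - x - 1 with x = log p_ref - log p. A sketch of the per-token penalty as it is typically computed (assumption: the exact clamping and aggregation details vary by verl version):

```python
import torch

def low_var_kl(logprob: torch.Tensor, ref_logprob: torch.Tensor) -> torch.Tensor:
    # k3 estimator: exp(x) - x - 1 with x = ref_logprob - logprob;
    # non-negative by construction, clamped for numerical safety.
    log_ratio = ref_logprob - logprob
    return (torch.exp(log_ratio) - log_ratio - 1).clamp(-10, 10)

logp = torch.tensor([-1.0, -2.0])
ref_logp = torch.tensor([-1.1, -1.8])
print(low_var_kl(logp, ref_logp))  # small non-negative per-token penalties
```
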
diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml
@@ -0,0 +1,45 @@
+- algorithm.adv_estimator=grpo
+- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+- data.train_batch_size=512
+- data.max_prompt_length=1024
+- data.max_response_length=2048
+- data.filter_overlong_prompts=True
+- data.truncation=error
+- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+- actor_rollout_ref.actor.optim.lr=1e-6
+- actor_rollout_ref.model.use_remove_padding=True
+- actor_rollout_ref.actor.ppo_mini_batch_size=256
+- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+- actor_rollout_ref.actor.use_kl_loss=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- actor_rollout_ref.actor.kl_loss_type=low_var_kl
+- actor_rollout_ref.actor.entropy_coeff=0
+- actor_rollout_ref.model.enable_gradient_checkpointing=True
+- actor_rollout_ref.actor.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.rollout.enforce_eager=True
+- actor_rollout_ref.rollout.max_model_len=8192
+- actor_rollout_ref.rollout.n=3
+- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- algorithm.use_kl_in_reward=False
+- trainer.critic_warmup=0
+- trainer.logger=["console","wandb"]
+- trainer.project_name=readctrl-verl
+- trainer.experiment_name=qwen3-4b-instruct-en
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=5
+- trainer.test_freq=10
+- +trainer.remove_previous_ckpt_in_save=true
+- trainer.max_actor_ckpt_to_keep=1
+- trainer.max_critic_ckpt_to_keep=1
+- trainer.resume_mode=auto
+- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+- trainer.total_epochs=15
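
With algorithm.adv_estimator=grpo and norm_adv_by_std_in_grpo=true, the rollout.n=3 responses to each prompt form a group, and each response's advantage is its reward standardized within that group. A minimal sketch of the estimator:

```python
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """rewards: (num_prompts, group_size) scalar reward per sampled response."""
    mean = rewards.mean(dim=-1, keepdim=True)
    std = rewards.std(dim=-1, keepdim=True)
    return (rewards - mean) / (std + eps)

r = torch.tensor([[1.0, 0.0, 0.0],   # one correct response in the group
                  [1.0, 1.0, 1.0]])  # uniform rewards -> zero advantage
print(grpo_advantages(r))
```
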